From b7b9dc0b35a836dedb02b109c9d4a79a0db2aaca Mon Sep 17 00:00:00 2001 From: Gwen Date: Mon, 26 Oct 2020 15:19:38 -0600 Subject: [PATCH 001/133] Fix for bugs in lazy write handling --- src/gpgpu-sim/gpu-cache.cc | 5 ++++- src/gpgpu-sim/gpu-cache.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..d44c959b3 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1455,16 +1455,19 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); - block->set_status(MODIFIED, mf->get_access_sector_mask()); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + } else { + block->set_status(MODIFIED, mf->get_access_sector_mask()); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { block->set_m_readable(true, mf->get_access_sector_mask()); } else { block->set_m_readable(false, mf->get_access_sector_mask()); + if (m_status == HIT_RESERVED) + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } if (m_status != RESERVATION_FAIL) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..25d0b7826 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -128,6 +128,8 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_modified_on_fill(bool m_modified, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -147,6 +149,7 @@ struct line_cache_block : public cache_block_t { m_status = INVALID; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; m_readable = true; } void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, @@ -159,12 +162,16 @@ struct line_cache_block : public cache_block_t { m_status = RESERVED; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; } void fill(unsigned time, mem_access_sector_mask_t sector_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? MODIFIED : VALID; + + if (m_set_readable_on_fill) + m_readable = true; m_fill_time = time; } @@ -197,6 +204,10 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_modified_on_fill = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + m_set_readable_on_fill = readable; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. 
cache line size } @@ -218,6 +229,7 @@ struct line_cache_block : public cache_block_t { cache_block_state m_status; bool m_ignore_on_fill_status; bool m_set_modified_on_fill; + bool m_set_readable_on_fill; bool m_readable; }; @@ -232,6 +244,7 @@ struct sector_cache_block : public cache_block_t { m_status[i] = INVALID; m_ignore_on_fill_status[i] = false; m_set_modified_on_fill[i] = false; + m_set_readable_on_fill[i] = false; m_readable[i] = true; } m_line_alloc_time = 0; @@ -261,6 +274,7 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -283,6 +297,8 @@ struct sector_cache_block : public cache_block_t { else m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; // m_set_modified_on_fill[sidx] = false; @@ -300,6 +316,11 @@ struct sector_cache_block : public cache_block_t { // assert( m_status[sidx] == RESERVED ); m_status[sidx] = m_set_modified_on_fill[sidx] ? MODIFIED : VALID; + + if (m_set_readable_on_fill[sidx]) { + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; + } m_sector_fill_time[sidx] = time; m_line_fill_time = time; @@ -366,6 +387,11 @@ struct sector_cache_block : public cache_block_t { m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + unsigned sidx = get_sector_index(sector_mask); + m_set_readable_on_fill[sidx] = readable; + } virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) { unsigned sidx = get_sector_index(sector_mask); @@ -400,6 +426,7 @@ struct sector_cache_block : public cache_block_t { cache_block_state m_status[SECTOR_CHUNCK_SIZE]; bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_readable[SECTOR_CHUNCK_SIZE]; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { From 950464e7f8e512f2beb0c9e0883db3489bf84cec Mon Sep 17 00:00:00 2001 From: allen Date: Mon, 9 Nov 2020 21:43:08 +0900 Subject: [PATCH 002/133] change address type into ull --- src/abstract_hardware_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 49f3e9f90..c012de0d8 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -75,8 +75,8 @@ enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; typedef unsigned long long new_addr_type; typedef unsigned long long cudaTextureObject_t; -typedef unsigned address_type; -typedef unsigned addr_t; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; // the following are operations the timing model can see #define SPECIALIZED_UNIT_NUM 8 From 07f77e1c3d1f1222de21bb77e4dcc5a6ab94a90f Mon Sep 17 00:00:00 2001 From: allen Date: Mon, 9 Nov 2020 21:46:01 +0900 Subject: [PATCH 003/133] do not truncate 32 MSB bits of the memory address --- src/abstract_hardware_model.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 5ad6f105d..e0e1d23cf 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -205,8 +205,8 @@ 
gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle = 0; } -address_type line_size_based_tag_func(new_addr_type address, - new_addr_type line_size) { +new_addr_type line_size_based_tag_func(new_addr_type address, + new_addr_type line_size) { // gives the tag for an address based on a given line size return address & ~(line_size - 1); } @@ -448,7 +448,7 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +530,7 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread access? @@ -552,7 +552,7 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +625,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? 
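[Editorial note; not part of any commit in this series.] Patches 002/003 widen address_type/addr_t and the return/block-address types used with line_size_based_tag_func from 32-bit unsigned to unsigned long long, so addresses above 4 GB keep their upper 32 bits when cache tags are computed. Below is a minimal standalone sketch of the truncation being fixed; only names already appearing in the patches are reused, and the example address is hypothetical.

#include <cstdio>

typedef unsigned long long new_addr_type;

// Same body as in abstract_hardware_model.cc, with the widened return type:
// the computed tag keeps all 64 bits of the address.
new_addr_type line_size_based_tag_func(new_addr_type address,
                                       new_addr_type line_size) {
  return address & ~(line_size - 1);
}

int main() {
  new_addr_type addr = 0x123456789ULL;       // hypothetical address above 4 GB
  new_addr_type tag64 = line_size_based_tag_func(addr, 128);
  unsigned tag32 = (unsigned)tag64;          // what the old 32-bit types kept
  printf("64-bit tag   : 0x%llx\n", tag64);  // 0x123456780
  printf("truncated tag: 0x%x\n", tag32);    // 0x23456780 -> aliases another line
  return 0;
}
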
From 132c2ce4ef3ff12f984881ca4b6a8780797dacff Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Sun, 15 Nov 2020 15:41:39 -0500 Subject: [PATCH 004/133] added MSHR_HIT --- src/gpgpu-sim/gpu-cache.cc | 3 ++- src/gpgpu-sim/gpu-cache.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..613403a49 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -37,7 +37,7 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char *) == NUM_CACHE_REQUEST_STATUS); @@ -1123,6 +1123,7 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); + m_stats.inc_stats(mf->get_access_type(), MSHR_HIT); do_miss = true; } else if (!mshr_hit && mshr_avail && diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..17c8c02d8 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -49,6 +49,7 @@ enum cache_request_status { MISS, RESERVATION_FAIL, SECTOR_MISS, + MSHR_HIT, NUM_CACHE_REQUEST_STATUS }; From f3a00778b98cf101c8052e9fe1dd2d4c08185b7e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 12 Feb 2021 16:13:46 -0500 Subject: [PATCH 005/133] bug fix was_writeback_sent --- src/gpgpu-sim/gpu-cache.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index af22c4c2c..eb9500485 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -496,8 +496,10 @@ bool was_writeback_sent(const std::list &events, cache_event &wb_event) { for (std::list::const_iterator e = events.begin(); e != events.end(); e++) { - if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) wb_event = *e; - return true; + if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) { + wb_event = *e; + return true; + } } return false; } From 51d99259845a051a32e45763bdf3005b4dff74b5 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 15 Feb 2021 16:03:51 -0500 Subject: [PATCH 006/133] fix hash funciton --- src/gpgpu-sim/gpu-cache.cc | 4 ++-- src/gpgpu-sim/gpu-cache.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 8f7ccd591..1c36d224c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -63,8 +63,8 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving, - m_l1_banks_log2, + return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 26369c33a..00c09ae55 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -817,15 +817,15 @@ class l1d_cache_config : public cache_config { 
l1d_cache_config() : cache_config() {} unsigned set_bank(new_addr_type addr) const; void init(char *config, FuncCache status) { - m_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); - m_l1_banks_log2 = LOGB2(l1_banks); + l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + l1_banks_log2 = LOGB2(l1_banks); cache_config::init(config, status); } unsigned l1_latency; unsigned l1_banks; - unsigned m_l1_banks_log2; + unsigned l1_banks_log2; unsigned l1_banks_byte_interleaving; - unsigned m_banks_byte_interleaving_log2; + unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; }; From b430b36911b48228ed7eb77457cc378261151a13 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Thu, 25 Feb 2021 16:25:43 -0500 Subject: [PATCH 007/133] adding new RTX 3070 config --- .../SM86_RTX3070/config_ampere_islip.icnt | 74 +++++++ .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 192 ++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt create mode 100644 configs/tested-cfgs/SM86_RTX3070/gpgpusim.config diff --git a/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt new file mode 100644 index 000000000..6775d5d6f --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 78; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config new file mode 100644 index 000000000..2010aa698 --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -0,0 +1,192 @@ +# This config models the Ampere RTX 3070 +# For more info about Ampere architecture: +# https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf +# https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf +# https://en.wikipedia.org/wiki/GeForce_30_series +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 
+-gpgpu_ptx_force_max_capability 86 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 8 +-gpgpu_compute_capability_minor 6 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 46 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 + +# Ampere clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1320.0:1320.0:1320.0:3500.0 +# boost mode +# -gpgpu_clock_domains 1780.0:1780.0:1780.0:3500.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 86 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Ampere GA102 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 32 +-ptx_opcode_initiation_tensor 32 + +# Ampere has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# Ampere has 24 double-ported banks, 4 schedulers, 6 banks per scheduler +-gpgpu_num_reg_banks 24 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 + +# Ampere has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler gto +## In Ampere, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Default config is 28KB DL1 and 100KB shared memory +# In Ampere, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +# Ampere unified cache has four banks +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_shmem_size 102400 +-gpgpu_shmem_sizeDefault 102400 +-gpgpu_shmem_per_block 102400 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 20 +-gpgpu_smem_latency 20 +-gpgpu_flush_l1_cache 1 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 3MB L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprecated, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_ampere_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# Ampere RTX3060 has GDDR6 +# http://monitorinsider.com/GDDR6.html +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 2 +-gpgpu_dram_burst_length 16 +-dram_data_command_freq_ratio 4 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Use the same GDDR5 timing, scaled to 3500MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: + CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it untill we create a real energy model for Ampere +-power_simulation_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + From 09f10eb4c28b6cc76c7c7cc3181c340cf8ec2be5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Thu, 25 Mar 2021 12:44:09 -0400 Subject: [PATCH 008/133] change the L1 cache policy to be on-miss based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 6fe04eecd..8be9a73d2 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -101,7 +101,7 @@ # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index c4818d10f..18f55641d 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -115,7 +115,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 2010aa698..11dbcaf1c 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -109,7 +109,7 @@ -gpgpu_adaptive_cache_config 1 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 From 1ee03f0116511ac3c2d6ac7688d916191f4f0a6b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Thu, 25 Mar 2021 12:54:14 -0400 Subject: [PATCH 009/133] change the L1 cache policy based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 8be9a73d2..6189dca0f 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -101,7 +101,7 @@ # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 18f55641d..bc5677cf3 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -115,7 +115,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 11dbcaf1c..f5418ad8e 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -109,7 +109,7 @@ -gpgpu_adaptive_cache_config 1 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 From 553346445486367799d4d67bf3537e54b7c83859 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Sun, 9 May 2021 13:11:39 -0400 Subject: [PATCH 010/133] parition CU allocation, add prints --- src/abstract_hardware_model.h | 12 +++++++++++- src/gpgpu-sim/shader.cc | 25 +++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index c012de0d8..636052ad7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1315,7 +1315,17 @@ class register_set { } return false; } - + unsigned get_ready_reg_id() { + // for sub core model we need to figure which reg_id has the ready warp + // this function should only be called if has_ready() was true + assert(has_ready()); + for (unsigned i = 0; i < regs.size(); i++) { + if (not regs[i]->empty()) { + return i; + } + } + abort(); + } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); move_warp(*free, src); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..40120ec9c 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3974,7 +3974,18 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { for (unsigned j = 0; j < inp.m_cu_sets.size(); j++) { std::vector &cu_set = m_cus[inp.m_cu_sets[j]]; bool allocated = false; - for (unsigned k = 0; k < cu_set.size(); k++) { + unsigned cuLowerBound = 0; + unsigned cuUpperBound = cu_set.size(); + if(sub_core_model) { + // Sub core model only 
allocates on the subset of CUs assigned to the scheduler that issued + unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + assert(cu_set.size() % m_num_warp_scheds == 0); + unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; + cuLowerBound = reg_id * cusPerSched; + cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); + } + for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); @@ -3984,7 +3995,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - break; // can only service a single input, if it failed it will fail for + //break; // can only service a single input, if it failed it will fail for // others. } } @@ -4098,6 +4109,16 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // move_warp(*m_output_register,m_warp); + // Print out which OC dispatched which warp sched id to which exec pipeline + std::cout << "Dispatched from OC: " + << this->get_id() + << "\t Warp_id: " + << m_warp->get_uid() + << "\t Sched_id: " + << m_warp->get_schd_id() + << "\tto execution register: " + << m_output_register->get_name() + << std::endl; m_output_register->move_in(m_warp); m_free = true; m_output_register = NULL; From 645a0eaa6b431c5d4279330c72905ac6b6e7abb2 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 13:23:12 -0400 Subject: [PATCH 011/133] minor fixes --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 636052ad7..4d2bb4c4b 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1291,6 +1291,7 @@ class register_set { } m_name = name; } + const char * get_name() {return m_name;} bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 40120ec9c..372bc128a 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3867,7 +3867,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { assert((m_bank_warp_shift == 5) || (m_warp_size != 32)); sub_core_model = shader->get_config()->sub_core_model; - m_num_warp_sceds = shader->get_config()->gpgpu_num_sched_per_core; + m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; if (sub_core_model) assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); m_num_banks_per_sched = diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 6481790bc..05c0e4c93 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -947,7 +947,7 @@ class opndcoll_rfu_t { // operand collector based register file unit arbiter_t m_arbiter; unsigned m_num_banks_per_sched; - unsigned m_num_warp_sceds; + unsigned m_num_warp_scheds; bool sub_core_model; // unsigned m_num_ports; From 46423a22b7c11663e4849dbd3bb77f2d530f6907 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 14:07:05 -0400 Subject: [PATCH 012/133] useful print statement --- src/gpgpu-sim/shader.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 372bc128a..895a2ef84 100644 --- 
a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3983,10 +3983,12 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = reg_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; + std::cout << "reg_id: " << reg_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { + std::cout << "Allocated on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); From b67288046af824a88f8bb94541ded14cc711ef35 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 14:42:29 -0400 Subject: [PATCH 013/133] validated collector unit partitioning based on scheduler --- src/abstract_hardware_model.h | 16 ++++++++++++++-- src/gpgpu-sim/shader.cc | 8 +++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 4d2bb4c4b..ba32358b7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1320,12 +1320,24 @@ class register_set { // for sub core model we need to figure which reg_id has the ready warp // this function should only be called if has_ready() was true assert(has_ready()); + warp_inst_t **ready; + ready = NULL; + unsigned reg_id; for (unsigned i = 0; i < regs.size(); i++) { if (not regs[i]->empty()) { - return i; + if (ready and (*ready)->get_uid() < regs[i]->get_uid()) { + // ready is oldest + } else { + ready = ®s[i]; + reg_id = i; + } } } - abort(); + return reg_id; + } + unsigned get_schd_id(unsigned reg_id) { + assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 895a2ef84..5c27b9b5e 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3976,19 +3976,21 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { bool allocated = false; unsigned cuLowerBound = 0; unsigned cuUpperBound = cu_set.size(); + unsigned schd_id; if(sub_core_model) { // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + schd_id = (*inp.m_in[i]).get_schd_id(reg_id); assert(cu_set.size() % m_num_warp_scheds == 0); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; - cuLowerBound = reg_id * cusPerSched; + cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; - std::cout << "reg_id: " << reg_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; + std::cout << "reg_id: " << reg_id << " schd_id: " << schd_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated on cu: " << k << std::endl; + std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); From fa76ab438b0b8c2d2e8abf5f395c7a98a3d5fd9b Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 15:05:06 -0400 Subject: [PATCH 014/133] sub 
core model dispatches only to assigned exec pipelines --- src/abstract_hardware_model.h | 11 +++++++++++ src/gpgpu-sim/shader.cc | 17 ++++++++++------- src/gpgpu-sim/shader.h | 10 ++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index ba32358b7..d70c3ebc3 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1346,6 +1346,17 @@ class register_set { // void copy_in( warp_inst_t* src ){ // src->copy_contents_to(*get_free()); //} + void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); + } + void move_out_to(warp_inst_t *&dest) { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 5c27b9b5e..ec1073334 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3939,7 +3939,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(); + collector_unit_t *cu = du.find_ready(sub_core_model, p); if (cu) { for (unsigned i = 0; i < (cu->get_num_operands() - cu->get_num_regs()); i++) { @@ -3961,7 +3961,9 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - cu->dispatch(); + unsigned cusPerSched = du->get_num_collectors() / m_num_warp_scheds; + unsigned reg_id = p / cusPerSched; + cu->dispatch(sub_core_model, reg_id); } } } @@ -3985,7 +3987,6 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; - std::cout << "reg_id: " << reg_id << " schd_id: " << schd_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { @@ -4046,8 +4047,8 @@ void opndcoll_rfu_t::allocate_reads() { } } -bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(); +bool opndcoll_rfu_t::collector_unit_t::ready(bool sub_core_model, unsigned reg_id) const { + return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(sub_core_model, reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4110,7 +4111,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, return false; } -void opndcoll_rfu_t::collector_unit_t::dispatch() { +void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned reg_id) { assert(m_not_ready.none()); // move_warp(*m_output_register,m_warp); // Print out which OC dispatched which warp sched id to which exec pipeline @@ -4122,8 +4123,10 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << m_warp->get_schd_id() << "\tto execution register: " << m_output_register->get_name() + << "\treg id: " + << reg_id << std::endl; - m_output_register->move_in(m_warp); + m_output_register->move_in(sub_core_model, reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++) m_src_op[i].reset(); diff --git 
a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 05c0e4c93..74bf32093 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready() const; + bool ready(bool sub_core_modle, unsigned reg_id) const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -888,7 +888,7 @@ class opndcoll_rfu_t { // operand collector based register file unit void collect_operand(unsigned op) { m_not_ready.reset(op); } unsigned get_num_operands() const { return m_warp->get_num_operands(); } unsigned get_num_regs() const { return m_warp->get_num_regs(); } - void dispatch(); + void dispatch(bool sub_core_model, unsigned reg_id); bool is_free() { return m_free; } private: @@ -917,10 +917,10 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready() { + collector_unit_t *find_ready(bool sub_core_model, unsigned reg_id) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { + if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); } @@ -928,6 +928,8 @@ class opndcoll_rfu_t { // operand collector based register file unit return NULL; } + unsigned get_num_collectors(){return m_num_collectors;} + private: unsigned m_num_collectors; std::vector *m_collector_units; From c905726ae9921e6ba67df77fd4ba5bb87215d69d Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 15:08:28 -0400 Subject: [PATCH 015/133] minor fix accessing du --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index ec1073334..c3b8d3949 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3961,7 +3961,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned cusPerSched = du->get_num_collectors() / m_num_warp_scheds; + unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; unsigned reg_id = p / cusPerSched; cu->dispatch(sub_core_model, reg_id); } From a72b84e0f6e90754728d0309aac5dca1e00b7874 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 18:50:34 -0400 Subject: [PATCH 016/133] fix find_ready reg_id --- src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c3b8d3949..d9d441149 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3962,7 +3962,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { } } unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - unsigned reg_id = p / cusPerSched; + unsigned reg_id = cu->get_id() / cusPerSched; cu->dispatch(sub_core_model, reg_id); } } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 74bf32093..9b14bfdc5 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -917,9 +917,10 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready(bool sub_core_model, unsigned reg_id) { + collector_unit_t *find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; + unsigned reg_id 
= c / m_num_collectors; if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); @@ -929,7 +930,7 @@ class opndcoll_rfu_t { // operand collector based register file unit } unsigned get_num_collectors(){return m_num_collectors;} - + private: unsigned m_num_collectors; std::vector *m_collector_units; From 6ad5bad1d992e1add154957ac4903ce17007b912 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 19:34:48 -0400 Subject: [PATCH 017/133] dont need du id --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d9d441149..943e38c7d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3939,7 +3939,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(sub_core_model, p); + collector_unit_t *cu = du.find_ready(sub_core_model); if (cu) { for (unsigned i = 0; i < (cu->get_num_operands() - cu->get_num_regs()); i++) { From 92192368f2545cd6fc1004047af8b57762637dbf Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 19:40:46 -0400 Subject: [PATCH 018/133] remove prints --- src/gpgpu-sim/shader.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 943e38c7d..928e1083a 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3991,7 +3991,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; + //std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4113,9 +4113,8 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned reg_id) { assert(m_not_ready.none()); - // move_warp(*m_output_register,m_warp); // Print out which OC dispatched which warp sched id to which exec pipeline - std::cout << "Dispatched from OC: " + /* std::cout << "Dispatched from OC: " << this->get_id() << "\t Warp_id: " << m_warp->get_uid() @@ -4125,7 +4124,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned re << m_output_register->get_name() << "\treg id: " << reg_id - << std::endl; + << std::endl; */ m_output_register->move_in(sub_core_model, reg_id, m_warp); m_free = true; m_output_register = NULL; From 52a890cff520ea48d6bfa46ff7b85b5d5e06d1be Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 21:38:04 -0400 Subject: [PATCH 019/133] need at least 1 cu per sched for sub_core model, fix find_ready() reg_id --- src/gpgpu-sim/shader.cc | 9 ++++++--- src/gpgpu-sim/shader.h | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 928e1083a..c1bc495fc 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3961,8 +3961,11 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - unsigned reg_id = 
cu->get_id() / cusPerSched; + unsigned reg_id; + if (sub_core_model) { + unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; + reg_id = cu->get_id() / cusPerSched; + } cu->dispatch(sub_core_model, reg_id); } } @@ -3983,7 +3986,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); schd_id = (*inp.m_in[i]).get_schd_id(reg_id); - assert(cu_set.size() % m_num_warp_scheds == 0); + assert(cu_set.size() % m_num_warp_scheds == 0 && cu_set.size() >= m_num_warp_scheds); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 9b14bfdc5..0b96ec0c8 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -920,7 +920,12 @@ class opndcoll_rfu_t { // operand collector based register file unit collector_unit_t *find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - unsigned reg_id = c / m_num_collectors; + unsigned reg_id; + if (sub_core_model) { + assert (m_num_collectors >= m_num_warp_scheds); + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + reg_id = c / cusPerSched; + } if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); From 2db9120218c894c7d90ef833477c3e0ca5425213 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:15:23 -0400 Subject: [PATCH 020/133] move reg_id calc to cu object init --- src/gpgpu-sim/shader.cc | 19 +++++++++++++------ src/gpgpu-sim/shader.h | 15 ++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c1bc495fc..72476161f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3868,14 +3868,21 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { sub_core_model = shader->get_config()->sub_core_model; m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; - if (sub_core_model) + unsigned reg_id; + if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); + assert(m_num_warp_scheds >= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; for (unsigned j = 0; j < m_cu.size(); j++) { + if (sub_core_model) { + unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; + reg_id = j / cusPerSched; + } m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, - sub_core_model, m_num_banks_per_sched); + sub_core_model, reg_id, m_num_banks_per_sched); } m_initialized = true; } @@ -3962,10 +3969,8 @@ void opndcoll_rfu_t::dispatch_ready_cu() { } } unsigned reg_id; - if (sub_core_model) { - unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - reg_id = cu->get_id() / cusPerSched; - } + if (sub_core_model) + reg_id = cu->get_reg_id(); cu->dispatch(sub_core_model, reg_id); } } @@ -4074,6 +4079,7 @@ void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; @@ -4082,6 +4088,7 @@ void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, m_warp = new 
warp_inst_t(config); m_bank_warp_shift = log2_warp_size; m_sub_core_model = sub_core_model; + m_reg_id = reg_id; m_num_banks_per_sched = banks_per_sched; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 0b96ec0c8..a5a8166e7 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready(bool sub_core_modle, unsigned reg_id) const; + bool ready(bool sub_core_model, unsigned reg_id) const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -878,11 +878,12 @@ class opndcoll_rfu_t { // operand collector based register file unit } unsigned get_sp_op() const { return m_warp->sp_op; } unsigned get_id() const { return m_cuid; } // returns CU hw id + unsigned get_reg_id() const { return m_reg_id; } // modifiers void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned num_banks_per_sched); + bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -906,6 +907,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_banks_per_sched; bool m_sub_core_model; + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -921,11 +923,8 @@ class opndcoll_rfu_t { // operand collector based register file unit for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; unsigned reg_id; - if (sub_core_model) { - assert (m_num_collectors >= m_num_warp_scheds); - unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; - reg_id = c / cusPerSched; - } + if (sub_core_model) + reg_id = (*m_collector_units)[c].get_reg_id(); if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); @@ -934,8 +933,6 @@ class opndcoll_rfu_t { // operand collector based register file unit return NULL; } - unsigned get_num_collectors(){return m_num_collectors;} - private: unsigned m_num_collectors; std::vector *m_collector_units; From 4825a1dad0938a40c8feb01e554ca8f5fdc6c4c5 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:26:47 -0400 Subject: [PATCH 021/133] fix assert --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 72476161f..acd41d868 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3871,7 +3871,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned reg_id; if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); - assert(m_num_warp_scheds >= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + assert(m_num_warp_scheds <= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; From e2b410dd117b11098e6bb88be36293afbeb5c444 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:45:02 -0400 Subject: [PATCH 022/133] clean up redundant method args --- src/gpgpu-sim/shader.cc | 13 +++++-------- src/gpgpu-sim/shader.h | 4 ++-- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git 
a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index acd41d868..e3a3e9c11 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3968,10 +3968,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned reg_id; - if (sub_core_model) - reg_id = cu->get_reg_id(); - cu->dispatch(sub_core_model, reg_id); + cu->dispatch(); } } } @@ -4055,8 +4052,8 @@ void opndcoll_rfu_t::allocate_reads() { } } -bool opndcoll_rfu_t::collector_unit_t::ready(bool sub_core_model, unsigned reg_id) const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(sub_core_model, reg_id); +bool opndcoll_rfu_t::collector_unit_t::ready() const { + return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4121,7 +4118,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, return false; } -void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned reg_id) { +void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // Print out which OC dispatched which warp sched id to which exec pipeline /* std::cout << "Dispatched from OC: " @@ -4135,7 +4132,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned re << "\treg id: " << reg_id << std::endl; */ - m_output_register->move_in(sub_core_model, reg_id, m_warp); + m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++) m_src_op[i].reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index a5a8166e7..00e7deb05 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready(bool sub_core_model, unsigned reg_id) const; + bool ready() const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -889,7 +889,7 @@ class opndcoll_rfu_t { // operand collector based register file unit void collect_operand(unsigned op) { m_not_ready.reset(op); } unsigned get_num_operands() const { return m_warp->get_num_operands(); } unsigned get_num_regs() const { return m_warp->get_num_regs(); } - void dispatch(bool sub_core_model, unsigned reg_id); + void dispatch(); bool is_free() { return m_free; } private: From 9c0156bd732fe370d5022ca036fff515fcd9d2d4 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 22:58:05 -0400 Subject: [PATCH 023/133] more cleanup --- src/gpgpu-sim/shader.cc | 4 ++-- src/gpgpu-sim/shader.h | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e3a3e9c11..9eab7fcad 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3996,7 +3996,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - //std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; + // std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4130,7 +4130,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << "\tto execution register: " << 
m_output_register->get_name() << "\treg id: " - << reg_id + << this->get_reg_id() << std::endl; */ m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 00e7deb05..7655cb9e6 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -922,10 +922,7 @@ class opndcoll_rfu_t { // operand collector based register file unit collector_unit_t *find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - unsigned reg_id; - if (sub_core_model) - reg_id = (*m_collector_units)[c].get_reg_id(); - if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { + if ((*m_collector_units)[c].ready()) { m_last_cu = c; return &((*m_collector_units)[c]); } From 28c3c94e4e76f5c2a9fffb557587c6be3b541ccf Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 23:02:17 -0400 Subject: [PATCH 024/133] cleanup find_ready --- src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 9eab7fcad..db24d8c52 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3946,7 +3946,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(sub_core_model); + collector_unit_t *cu = du.find_ready(); if (cu) { for (unsigned i = 0; i < (cu->get_num_operands() - cu->get_num_regs()); i++) { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 7655cb9e6..75734e476 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -919,7 +919,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready(bool sub_core_model) { + collector_unit_t *find_ready() { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; if ((*m_collector_units)[c].ready()) { From 28d056519c7f1771557f90d5b0b295b7f75c1a2d Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Tue, 11 May 2021 18:13:37 -0400 Subject: [PATCH 025/133] partition issue() in the shader execute stage --- src/abstract_hardware_model.h | 16 ++++++++ src/gpgpu-sim/shader.cc | 72 +++++++++++++++++++---------------- src/gpgpu-sim/shader.h | 26 +++++++++---- 3 files changed, 74 insertions(+), 40 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index d70c3ebc3..90ae44896 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1316,6 +1316,12 @@ class register_set { } return false; } + bool has_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return has_ready(); + assert(reg_id < regs.size()); + return (not regs[reg_id]->empty()) + } + unsigned get_ready_reg_id() { // for sub core model we need to figure which reg_id has the ready warp // this function should only be called if has_ready() was true @@ -1376,6 +1382,16 @@ class register_set { } return ready; } + warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) + return get_ready(); + warp_inst_t **ready; + ready = NULL; + assert(reg_id < regs.size()); + if (not regs[reg_id]->empty) + ready = ®s[reg_id]; + return ready; + } void print(FILE *fp) const { fprintf(fp, "%s : @%p\n", m_name, this); diff --git 
a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e3a3e9c11..ca421deba 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -377,41 +377,41 @@ void shader_core_ctx::create_exec_pipeline() { // m_fu = new simd_function_unit*[m_num_function_units]; - for (int k = 0; k < m_config->gpgpu_num_sp_units; k++) { - m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sp_units; k++) { + m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SP); m_issue_port.push_back(OC_EX_SP); } - for (int k = 0; k < m_config->gpgpu_num_dp_units; k++) { - m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_dp_units; k++) { + m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_DP); m_issue_port.push_back(OC_EX_DP); } - for (int k = 0; k < m_config->gpgpu_num_int_units; k++) { - m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_int_units; k++) { + m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_INT); m_issue_port.push_back(OC_EX_INT); } - for (int k = 0; k < m_config->gpgpu_num_sfu_units; k++) { - m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sfu_units; k++) { + m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SFU); m_issue_port.push_back(OC_EX_SFU); } - for (int k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { - m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { + m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_TENSOR_CORE); m_issue_port.push_back(OC_EX_TENSOR_CORE); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { for (unsigned k = 0; k < m_config->m_specialized_unit[j].num_units; k++) { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -419,7 +419,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc); + m_memory_config, m_stats, m_sid, m_tpc, static_cast(0)); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -1669,8 +1669,13 @@ void shader_core_ctx::execute() { m_fu[n]->active_lanes_in_pipeline(); unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; - warp_inst_t **ready_reg = issue_inst.get_ready(); - if (issue_inst.has_ready() && m_fu[n]->can_issue(**ready_reg)) { + unsigned reg_id; + bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + if (m_config->sub_core_model) { + reg_id = m_fu[n]->get_issue_reg_id(); + } + warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, 
reg_id); + if (issue_inst.has_ready(partition_issue, reg_id) && m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -2113,16 +2118,17 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } sfu::sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, + issue_reg_id) { m_name = "SFU"; } tensor_core::tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) + shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_tensor_core_latency, - core) { + core, issue_reg_id) { m_name = "TENSOR_CORE"; } @@ -2208,29 +2214,29 @@ void tensor_core::active_lanes_in_pipeline() { } sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core) { + char *unit_name, unsigned latency, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, issue_reg_id) { m_name = "INT "; } @@ -2269,7 +2275,8 @@ void int_unit ::issue(register_set &source_reg) { pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core) + shader_core_ctx *core, + unsigned issue_reg_id) : simd_function_unit(config) { m_result_port = result_port; m_pipeline_depth = max_latency; @@ -2277,6 +2284,7 @@ pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, for (unsigned i = 0; i < m_pipeline_depth; i++) m_pipeline_reg[i] = new warp_inst_t(config); m_core = core; + m_issue_reg_id = issue_reg_id; active_insts_in_pipeline = 0; } @@ -2359,8 +2367,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc) - : pipelined_simd_unit(NULL, config, config->smem_latency, core), + unsigned sid, unsigned tpc, unsigned issue_reg_id) + : pipelined_simd_unit(NULL, config, config->smem_latency, core, issue_reg_id), m_next_wb(config) { 
assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2387,8 +2395,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) - : pipelined_simd_unit(NULL, config, 3, core), + unsigned sid, unsigned tpc, l1_cache *new_l1d_cache, unsigned issue_reg_id) + : pipelined_simd_unit(NULL, config, 3, core, issue_reg_id), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 00e7deb05..ba37b0cee 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1075,7 +1075,7 @@ class pipelined_simd_unit : public simd_function_unit { public: pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); // modifiers virtual void cycle(); @@ -1096,6 +1096,7 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); for (int s = m_pipeline_depth - 1; s >= 0; s--) { @@ -1111,6 +1112,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue from a + // subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -1118,7 +1121,7 @@ class pipelined_simd_unit : public simd_function_unit { class sfu : public pipelined_simd_unit { public: sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1134,12 +1137,13 @@ class sfu : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class dp_unit : public pipelined_simd_unit { public: dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case DP_OP: @@ -1151,12 +1155,13 @@ class dp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class tensor_core : public pipelined_simd_unit { public: tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case TENSOR_CORE_OP: @@ -1168,12 +1173,13 @@ class tensor_core : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class int_unit : public pipelined_simd_unit { public: int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + 
shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1199,12 +1205,13 @@ class int_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class sp_unit : public pipelined_simd_unit { public: sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1228,13 +1235,14 @@ class sp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1243,6 +1251,7 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return false; } private: unsigned m_supported_op; @@ -1260,10 +1269,11 @@ class ldst_unit : public pipelined_simd_unit { shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc); + unsigned sid, unsigned tpc, unsigned issue_reg_id); // modifiers virtual void issue(register_set &inst); + bool is_issue_partitioned() { return false; } virtual void cycle(); void fill(mem_fetch *mf); From ec55c68bcdf4406743efa591fcb30e4f467012a0 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 19:30:09 -0400 Subject: [PATCH 026/133] minor fixes, pure virtual calls --- src/abstract_hardware_model.h | 4 ++-- src/gpgpu-sim/shader.cc | 16 ++++++++-------- src/gpgpu-sim/shader.h | 7 +++++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 90ae44896..6d431fc60 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1319,7 +1319,7 @@ class register_set { bool has_ready(bool sub_core_model, unsigned reg_id) { if (!sub_core_model) return has_ready(); assert(reg_id < regs.size()); - return (not regs[reg_id]->empty()) + return (not regs[reg_id]->empty()); } unsigned get_ready_reg_id() { @@ -1388,7 +1388,7 @@ class register_set { warp_inst_t **ready; ready = NULL; assert(reg_id < regs.size()); - if (not regs[reg_id]->empty) + if (not regs[reg_id]->empty()) ready = ®s[reg_id]; return ready; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 17cf5ba26..d98f10a95 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -411,7 +411,7 @@ void shader_core_ctx::create_exec_pipeline() { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency, k)); + m_config->m_specialized_unit[j].latency)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); 
m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -419,7 +419,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc, static_cast(0)); + m_memory_config, m_stats, m_sid, m_tpc); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -2222,8 +2222,8 @@ sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { + char *unit_name, unsigned latency) + : pipelined_simd_unit(result_port, config, latency, core, 0) { m_name = unit_name; m_supported_op = supported_op; } @@ -2367,8 +2367,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc, unsigned issue_reg_id) - : pipelined_simd_unit(NULL, config, config->smem_latency, core, issue_reg_id), + unsigned sid, unsigned tpc) + : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0), m_next_wb(config) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2395,8 +2395,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc, l1_cache *new_l1d_cache, unsigned issue_reg_id) - : pipelined_simd_unit(NULL, config, 3, core, issue_reg_id), + unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) + : pipelined_simd_unit(NULL, config, 3, core, 0), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 5c5e9a46b..62abd35ab 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1053,6 +1053,8 @@ class simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } + virtual bool is_issue_partitioned() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1093,6 +1095,7 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + virtual bool is_issue_partitioned() = 0; unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); @@ -1239,7 +1242,7 @@ class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id); + char *unit_name, unsigned latency); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1266,7 +1269,7 @@ class ldst_unit : public pipelined_simd_unit { 
shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc, unsigned issue_reg_id); + unsigned sid, unsigned tpc); // modifiers virtual void issue(register_set &inst); From 71455d84455f4a75bb2763ebe2fd58617a4ad843 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 20:07:08 -0400 Subject: [PATCH 027/133] add prints for ex issue validation --- src/gpgpu-sim/shader.cc | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d98f10a95..f838ba118 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1683,9 +1683,28 @@ void shader_core_ctx::execute() { assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); + warp_inst_t** instr = issue_inst.get_ready(true, reg_id); + std::cout << "EX stage issued warp_id: " + << (*instr)->warp_id() + << " schd_id: " + << (*instr)->get_schd_id() + << " to pipeline: " + << m_fu[n]->get_name() + << " issue reg_id: " + << m_fu[n]->get_issue_reg_id() + << std::endl; } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - } else { + std::cout << "EX stage issued warp_id: " + << (*instr)->warp_id() + << " schd_id: " + << (*instr)->get_schd_id() + << " to pipeline: " + << m_fu[n]->get_name() + << " issue reg_id: " + << m_fu[n]->get_issue_reg_id() + << std::endl; + } else { // stall issue (cannot reserve result bus) } } @@ -4004,7 +4023,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - // std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; + std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4129,7 +4148,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // Print out which OC dispatched which warp sched id to which exec pipeline - /* std::cout << "Dispatched from OC: " + std::cout << "Dispatched from OC: " << this->get_id() << "\t Warp_id: " << m_warp->get_uid() @@ -4139,7 +4158,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << m_output_register->get_name() << "\treg id: " << this->get_reg_id() - << std::endl; */ + << std::endl; m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; From 640674b74b12ef4b0188b267884eda9391f4bf34 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Tue, 11 May 2021 20:25:49 -0400 Subject: [PATCH 028/133] issue function needed to be constrained --- src/abstract_hardware_model.h | 5 +++++ src/gpgpu-sim/shader.cc | 12 ++++++------ src/gpgpu-sim/shader.h | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 6d431fc60..e9da4294e 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1367,6 +1367,11 @@ class register_set { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); } + void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { + if (!sub_core_model) { return move_out_to(dest);} + warp_inst_t **ready = 
get_ready(sub_core_model, reg_id); + move_warp(dest, *ready); + } warp_inst_t **get_ready() { warp_inst_t **ready; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index f838ba118..659d1590f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2152,7 +2152,7 @@ tensor_core::tensor_core(register_set *result_port, } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2161,7 +2161,7 @@ void sfu::issue(register_set &source_reg) { } void tensor_core::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2260,7 +2260,7 @@ int_unit::int_unit(register_set *result_port, const shader_core_config *config, } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2268,7 +2268,7 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2284,7 +2284,7 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2330,7 +2330,7 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 62abd35ab..2b0c71041 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1042,7 +1042,7 @@ class simd_function_unit { // modifiers virtual void issue(register_set &source_reg) { - source_reg.move_out_to(m_dispatch_reg); + source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); occupied.set(m_dispatch_reg->latency); } virtual void cycle() = 0; From 9b6af844b8adc5d15bd793646c18a7b1d9593890 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 20:35:49 -0400 Subject: [PATCH 029/133] fix print, move simd::issue() impl to .cc file --- src/gpgpu-sim/shader.cc | 6 ++++++ src/gpgpu-sim/shader.h | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 659d1590f..349f95462 100644 --- 
a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1695,6 +1695,7 @@ void shader_core_ctx::execute() { << std::endl; } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); + warp_inst_t** instr = issue_inst.get_ready(true, reg_id); std::cout << "EX stage issued warp_id: " << (*instr)->warp_id() << " schd_id: " @@ -2136,6 +2137,11 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { m_dispatch_reg = new warp_inst_t(config); } +void simd_function_unit::issue(register_set &source_reg) { + source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); + } + sfu::sfu(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 2b0c71041..7987427d1 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1041,10 +1041,7 @@ class simd_function_unit { ~simd_function_unit() { delete m_dispatch_reg; } // modifiers - virtual void issue(register_set &source_reg) { - source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + virtual void issue(register_set &source_reg); virtual void cycle() = 0; virtual void active_lanes_in_pipeline() = 0; From 6ae23912133b158670343da08469747cefef97d1 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Wed, 12 May 2021 12:53:36 -0400 Subject: [PATCH 030/133] fix prints / segfault --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/shader.cc | 32 +++++++++----------------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e9da4294e..129ed69d9 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1370,6 +1370,7 @@ class register_set { void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { if (!sub_core_model) { return move_out_to(dest);} warp_inst_t **ready = get_ready(sub_core_model, reg_id); + assert(ready != NULL); move_warp(dest, *ready); } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 349f95462..8816959f6 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1671,7 +1671,7 @@ void shader_core_ctx::execute() { register_set &issue_inst = m_pipeline_reg[issue_port]; unsigned reg_id; bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); - if (m_config->sub_core_model) { + if (partition_issue) { reg_id = m_fu[n]->get_issue_reg_id(); } warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); @@ -1683,28 +1683,10 @@ void shader_core_ctx::execute() { assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(true, reg_id); - std::cout << "EX stage issued warp_id: " - << (*instr)->warp_id() - << " schd_id: " - << (*instr)->get_schd_id() - << " to pipeline: " - << m_fu[n]->get_name() - << " issue reg_id: " - << m_fu[n]->get_issue_reg_id() - << std::endl; + warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(true, reg_id); - std::cout << "EX stage issued warp_id: " - << (*instr)->warp_id() - << " schd_id: " - << (*instr)->get_schd_id() - << " to 
pipeline: " - << m_fu[n]->get_name() - << " issue reg_id: " - << m_fu[n]->get_issue_reg_id() - << std::endl; + warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else { // stall issue (cannot reserve result bus) } @@ -2138,7 +2120,10 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } void simd_function_unit::issue(register_set &source_reg) { - source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); + bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); + std::cout << "EX stage issue stats:" << std::endl; + this->print(stdout); occupied.set(m_dispatch_reg->latency); } @@ -2336,7 +2321,8 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); From a450d74a66ed7c58aef66ea28f358230ac614f3d Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Wed, 12 May 2021 12:56:56 -0400 Subject: [PATCH 031/133] remove prints --- src/gpgpu-sim/shader.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 8816959f6..d978e6cf4 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2122,8 +2122,6 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { void simd_function_unit::issue(register_set &source_reg) { bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); - std::cout << "EX stage issue stats:" << std::endl; - this->print(stdout); occupied.set(m_dispatch_reg->latency); } @@ -4015,7 +4013,6 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4139,18 +4136,6 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); - // Print out which OC dispatched which warp sched id to which exec pipeline - std::cout << "Dispatched from OC: " - << this->get_id() - << "\t Warp_id: " - << m_warp->get_uid() - << "\t Sched_id: " - << m_warp->get_schd_id() - << "\tto execution register: " - << m_output_register->get_name() - << "\treg id: " - << this->get_reg_id() - << std::endl; m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; From 6a09900b34d2eaf5397fd24a5892bf09062be732 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Wed, 12 May 2021 15:36:37 -0400 Subject: [PATCH 032/133] rm unnecessary instr get --- src/gpgpu-sim/shader.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d978e6cf4..c72ed95db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc 
@@ -1683,10 +1683,8 @@ void shader_core_ctx::execute() { assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else { // stall issue (cannot reserve result bus) } From 5945d709530cc1419f624ffb048739f2b70ee1b9 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 10:42:38 -0400 Subject: [PATCH 033/133] specialized unit should be partitioned too --- src/gpgpu-sim/shader.cc | 6 +++--- src/gpgpu-sim/shader.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c72ed95db..3059b517f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -411,7 +411,7 @@ void shader_core_ctx::create_exec_pipeline() { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -2228,8 +2228,8 @@ sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core, 0) { + char *unit_name, unsigned latency, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 7987427d1..fa71af36c 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1239,7 +1239,7 @@ class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1248,7 +1248,7 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); - bool is_issue_partitioned() { return false; } + bool is_issue_partitioned() { return true; } private: unsigned m_supported_op; From 92c814a49dc98e282a46031543d289426dc04b00 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 10:54:41 -0400 Subject: [PATCH 034/133] run changes through clang-format --- src/abstract_hardware_model.h | 32 +++++------ src/gpgpu-sim/shader.cc | 104 +++++++++++++++++++--------------- src/gpgpu-sim/shader.h | 20 ++++--- 3 files changed, 87 insertions(+), 69 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 129ed69d9..982e41606 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1291,7 +1291,7 @@ class register_set { } m_name = name; } - const char * get_name() {return m_name;} + const char *get_name() { return m_name; } bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { 
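For reference, a minimal standalone sketch of the per-slot register_set behaviour that the sub-core-model changes above rely on: when sub_core_model is enabled, a scheduler or execution unit may only test and use its own reg_id slot instead of scanning the whole set. The slot_t type, the class name, and the main() driver below are illustrative assumptions only, not the simulator's actual warp_inst_t / register_set code.

#include <cassert>
#include <cstdio>
#include <vector>

// One pipeline slot; stands in for a warp_inst_t* that is either empty or
// holds an issued instruction. Purely illustrative.
struct slot_t {
  bool empty = true;
  unsigned warp_id = 0;
};

// Simplified register set: regs[i] is reserved for scheduler / issue id i
// when the sub-core model is on; otherwise any slot may be used.
class register_set_sketch {
 public:
  explicit register_set_sketch(unsigned n) : regs(n) {}

  bool has_ready(bool sub_core_model, unsigned reg_id) const {
    if (!sub_core_model) {
      for (const slot_t &s : regs)
        if (!s.empty) return true;
      return false;
    }
    assert(reg_id < regs.size());
    return !regs[reg_id].empty;   // only this scheduler's slot counts
  }

  void move_in(bool sub_core_model, unsigned reg_id, unsigned warp_id) {
    unsigned i = 0;
    if (sub_core_model) {
      assert(reg_id < regs.size());
      i = reg_id;                 // forced into the caller's own slot
    } else {
      while (i < regs.size() && !regs[i].empty) ++i;
    }
    assert(i < regs.size() && regs[i].empty);
    regs[i].empty = false;
    regs[i].warp_id = warp_id;
  }

 private:
  std::vector<slot_t> regs;
};

int main() {
  register_set_sketch oc_ex_sp(4);  // e.g. an OC_EX pipeline with 4 slots
  oc_ex_sp.move_in(true, 2, 7);     // scheduler 2 issues warp 7 into slot 2
  std::printf("slot 2 ready: %d, slot 0 ready: %d\n",
              oc_ex_sp.has_ready(true, 2), oc_ex_sp.has_ready(true, 0));
  return 0;
}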
@@ -1342,8 +1342,8 @@ class register_set { return reg_id; } unsigned get_schd_id(unsigned reg_id) { - assert(not regs[reg_id]->empty()); - return regs[reg_id]->get_schd_id(); + assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); @@ -1353,14 +1353,14 @@ class register_set { // src->copy_contents_to(*get_free()); //} void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { - warp_inst_t **free; - if (!sub_core_model) { - free = get_free(); - } else { - assert(reg_id < regs.size()); - free = get_free(sub_core_model, reg_id); - } - move_warp(*free, src); + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); } void move_out_to(warp_inst_t *&dest) { @@ -1368,7 +1368,9 @@ class register_set { move_warp(dest, *ready); } void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { - if (!sub_core_model) { return move_out_to(dest);} + if (!sub_core_model) { + return move_out_to(dest); + } warp_inst_t **ready = get_ready(sub_core_model, reg_id); assert(ready != NULL); move_warp(dest, *ready); @@ -1389,13 +1391,11 @@ class register_set { return ready; } warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { - if (!sub_core_model) - return get_ready(); + if (!sub_core_model) return get_ready(); warp_inst_t **ready; ready = NULL; assert(reg_id < regs.size()); - if (not regs[reg_id]->empty()) - ready = ®s[reg_id]; + if (not regs[reg_id]->empty()) ready = ®s[reg_id]; return ready; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 3059b517f..e84e38d92 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -166,18 +166,15 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos - ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos - ? CONCRETE_SCHEDULER_GTO - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != - std::string::npos - ? CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != std::string::npos + ? 
CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -1670,12 +1667,14 @@ void shader_core_ctx::execute() { unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; unsigned reg_id; - bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + bool partition_issue = + m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); if (partition_issue) { reg_id = m_fu[n]->get_issue_reg_id(); } warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); - if (issue_inst.has_ready(partition_issue, reg_id) && m_fu[n]->can_issue(**ready_reg)) { + if (issue_inst.has_ready(partition_issue, reg_id) && + m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -1685,7 +1684,7 @@ void shader_core_ctx::execute() { m_fu[n]->issue(issue_inst); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - } else { + } else { // stall issue (cannot reserve result bus) } } @@ -2118,15 +2117,17 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } void simd_function_unit::issue(register_set &source_reg) { - bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); - source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), + m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); +} sfu::sfu(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, - issue_reg_id) { + issue_reg_id) { m_name = "SFU"; } @@ -2139,7 +2140,8 @@ tensor_core::tensor_core(register_set *result_port, } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2148,7 +2150,8 @@ void sfu::issue(register_set &source_reg) { } void tensor_core::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2221,14 +2224,16 @@ void tensor_core::active_lanes_in_pipeline() { sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, + issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id) + char *unit_name, unsigned latency, + unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; @@ -2236,18 +2241,21 @@ 
specialized_unit::specialized_unit(register_set *result_port, dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, + issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, + issue_reg_id) { m_name = "INT "; } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2255,7 +2263,8 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2271,7 +2280,8 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2317,8 +2327,10 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); - warp_inst_t **ready_reg = source_reg.get_ready(partition_issue, m_issue_reg_id); + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = + source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); @@ -3886,7 +3898,8 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned reg_id; if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); - assert(m_num_warp_scheds <= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + assert(m_num_warp_scheds <= m_cu.size() && + m_cu.size() % m_num_warp_scheds == 0); } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; @@ -3999,11 +4012,13 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cuLowerBound = 0; unsigned cuUpperBound = cu_set.size(); unsigned schd_id; - if(sub_core_model) { - // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued + if (sub_core_model) { + // Sub core model only allocates on the subset of CUs assigned to the + // scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); 
schd_id = (*inp.m_in[i]).get_schd_id(reg_id); - assert(cu_set.size() % m_num_warp_scheds == 0 && cu_set.size() >= m_num_warp_scheds); + assert(cu_set.size() % m_num_warp_scheds == 0 && + cu_set.size() >= m_num_warp_scheds); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; @@ -4019,8 +4034,9 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - //break; // can only service a single input, if it failed it will fail for - // others. + // break; // can only service a single input, if it failed it will fail + // for + // others. } } } @@ -4067,7 +4083,8 @@ void opndcoll_rfu_t::allocate_reads() { } bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(m_sub_core_model, m_reg_id); + return (!m_free) && m_not_ready.none() && + (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4085,13 +4102,10 @@ void opndcoll_rfu_t::collector_unit_t::dump( } } -void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, - unsigned log2_warp_size, - const core_config *config, - opndcoll_rfu_t *rfu, - bool sub_core_model, - unsigned reg_id, - unsigned banks_per_sched) { +void opndcoll_rfu_t::collector_unit_t::init( + unsigned n, unsigned num_banks, unsigned log2_warp_size, + const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; m_num_banks = num_banks; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index fa71af36c..8c02fd7c1 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -238,7 +238,10 @@ class shd_warp_t { unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } unsigned get_warp_id() const { return m_warp_id; } - class shader_core_ctx * get_shader() { return m_shader; } + class shader_core_ctx *get_shader() { + return m_shader; + } + private: static const unsigned IBUFFER_SIZE = 2; class shader_core_ctx *m_shader; @@ -883,7 +886,8 @@ class opndcoll_rfu_t { // operand collector based register file unit // modifiers void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); + bool m_sub_core_model, unsigned reg_id, + unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -907,7 +911,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_banks_per_sched; bool m_sub_core_model; - unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -1051,7 +1055,7 @@ class simd_function_unit { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } virtual bool is_issue_partitioned() = 0; - virtual unsigned get_issue_reg_id() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1109,8 +1113,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; - unsigned m_issue_reg_id; // 
if sub_core_model is enabled we can only issue from a - // subset of operand collectors + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue + // from a subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -2145,8 +2149,8 @@ class shader_core_ctx : public core_t { friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; virtual void issue_warp(register_set &warp, const warp_inst_t *pI, - const active_mask_t &active_mask, unsigned warp_id, - unsigned sch_id); + const active_mask_t &active_mask, unsigned warp_id, + unsigned sch_id); void create_front_pipeline(); void create_schedulers(); From db1019769b9fb8776f3934e9ba5fd47437a5cee5 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 11:39:33 -0400 Subject: [PATCH 035/133] rm old dirs in format-code.sh --- format-code.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/format-code.sh b/format-code.sh index fb1cc909a..9f470854b 100755 --- a/format-code.sh +++ b/format-code.sh @@ -9,7 +9,4 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/*.h -clang-format -i ${THIS_DIR}/src/trace-driven/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/ISA_Def/*.h +clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc \ No newline at end of file From c52626267907b42ac6b611d7d7d0eaae3c825600 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 13 May 2021 13:45:59 -0400 Subject: [PATCH 036/133] fix adaptive cache cfg option parsing data type --- src/gpgpu-sim/gpu-sim.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 165068879..fd36e006a 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -326,7 +326,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); - option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_UINT32, + option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); option_parser_register( opp, "-gpgpu_shmem_sizeDefault", OPT_UINT32, &gpgpu_shmem_sizeDefault, From f2a7d9ce6cd13977d97a0601d732551a5451ac71 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Sat, 15 May 2021 09:09:20 -0400 Subject: [PATCH 037/133] fixing streaming cache based on recent ubench --- .../tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- src/gpgpu-sim/gpu-cache.cc | 13 ------- src/gpgpu-sim/gpu-cache.h | 38 +++++++++++-------- src/gpgpu-sim/shader.cc | 15 ++++++++ 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3fa51ee14..3af314c9e 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -116,7 +116,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 1c36d224c..c6a125d8d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -312,15 +312,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, abort(); // if an unreserved block exists, it is either invalid or // replaceable - if (probe_mode && m_config.is_streaming()) { - line_table::const_iterator i = - pending_lines.find(m_config.block_addr(addr)); - assert(mf); - if (!mf->is_write() && i != pending_lines.end()) { - if (i->second != mf->get_inst().get_uid()) return SECTOR_MISS; - } - } - return MISS; } @@ -1060,7 +1051,6 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { m_tag_array->fill(e->second.m_block_addr, time, mf); - if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); } else abort(); bool has_atomic = false; @@ -1136,9 +1126,6 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); - if (m_config.is_streaming() && m_config.m_cache_type == SECTOR) { - m_tag_array->add_pending_line(mf); - } m_extra_mf_fields[mf] = extra_mf_fields( mshr_addr, mf->get_addr(), cache_index, mf->get_data_size(), m_config); mf->set_data_size(m_config.get_atom_sz()); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..aa0a7e85a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -574,22 +574,26 @@ class cache_config { exit_parse_error(); } if (m_alloc_policy == STREAMING) { - // For streaming cache, we set the alloc policy to be on-fill to remove - // all line_alloc_fail stalls we set the MSHRs to be equal to max - // allocated cache lines. This is possible by moving TAG to be shared - // between cache line and MSHR enrty (i.e. for each cache line, there is - // an MSHR rntey associated with it) This is the easiest think we can - // think about to model (mimic) L1 streaming cache in Pascal and Volta - // Based on our microbenchmakrs, MSHRs entries have been increasing - // substantially in Pascal and Volta For more information about streaming - // cache, see: - // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf - // https://ieeexplore.ieee.org/document/8344474/ + /* + For streaming cache: + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail stalls. 
+ if the whole memory is allocated to the L1 cache, then make the allocation to be on_MISS + otherwise, make it ON_FILL to eliminate line allocation fails. + i.e. MSHR throughput is the same, independent on the L1 cache size/associativity + So, we set the allocation policy per kernel basis, see shader.cc, max_cta() function + + (2) We also set the MSHRs to be equal to max + allocated cache lines. This is possible by moving TAG to be shared + between cache line and MSHR enrty (i.e. for each cache line, there is + an MSHR rntey associated with it). This is the easiest think we can + think of to model (mimic) L1 streaming cache in Pascal and Volta + + For more information about streaming cache, see: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + https://ieeexplore.ieee.org/document/8344474/ + */ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; - m_mshr_max_merge = MAX_WARP_PER_SM; } switch (mshr_type) { case 'F': @@ -638,7 +642,8 @@ class cache_config { } // detect invalid configuration - if (m_alloc_policy == ON_FILL and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) + and m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. If the memory subsystem is @@ -750,6 +755,9 @@ class cache_config { } bool is_streaming() { return m_is_streaming; } FuncCache get_cache_status() { return cache_status; } + void set_allocation_policy(enum allocation_policy_t alloc) { + m_alloc_policy = alloc; + } char *m_config_string; char *m_config_stringPrefL1; char *m_config_stringPrefShared; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..0ad9547b0 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3308,6 +3308,21 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { m_L1D_config.get_total_size_inKB()); } + if(m_L1D_config.is_streaming()) { + //for streaming cache, if the whole memory is allocated + //to the L1 cache, then make the allocation to be on_MISS + //otherwise, make it ON_FILL to eliminate line allocation fails + //i.e. MSHR throughput is the same, independent on the L1 cache size/associativity + if(total_shmed == 0) { + m_L1D_config.set_allocation_policy(ON_MISS); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + } + else { + m_L1D_config.set_allocation_policy(ON_FILL); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + } + } + k.cache_config_set = true; } From 134739518a4a0f8a66cbf8c8a44b1a0ce178f7d5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Sat, 15 May 2021 09:15:38 -0400 Subject: [PATCH 038/133] adding the missing xoring hashing --- src/gpgpu-sim/gpu-cache.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index aa0a7e85a..b2db1c5ff 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -689,6 +689,9 @@ class cache_config { case 'L': m_set_index_function = LINEAR_SET_FUNCTION; break; + case 'X': + m_set_index_function = BITWISE_XORING_FUNCTION; + break; default: exit_parse_error(); } From 6319e31a8ee5ebac7499756029878a1ebbb4384e Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Sat, 15 May 2021 09:23:23 -0400 Subject: [PATCH 039/133] moving reg file read to read_operands function as before --- src/gpgpu-sim/shader.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 0ad9547b0..e6bfca042 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1569,7 +1569,10 @@ void swl_scheduler::order_warps() { } } -void shader_core_ctx::read_operands() {} +void shader_core_ctx::read_operands() { + for (int i = 0; i < m_config->reg_file_port_throughput; ++i) + m_operand_collector.step(); +} address_type coalesced_segment(address_type addr, unsigned segment_size_lg2bytes) { @@ -2550,8 +2553,7 @@ inst->space.get_type() != shared_space) { unsigned warp_id = inst->warp_id(); */ void ldst_unit::cycle() { writeback(); - for (int i = 0; i < m_config->reg_file_port_throughput; ++i) - m_operand_collector->step(); + for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) if (m_pipeline_reg[stage]->empty() && !m_pipeline_reg[stage + 1]->empty()) move_warp(m_pipeline_reg[stage], m_pipeline_reg[stage + 1]); From c94b883ac62e3b7dfbc69f6bad3b4c86b62eeb8c Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Mon, 17 May 2021 10:57:48 -0400 Subject: [PATCH 040/133] code refactoring cycle() --- src/gpgpu-sim/shader.cc | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e6bfca042..34040fba0 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1228,22 +1228,6 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); // This code need to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && @@ -1251,6 +1235,13 @@ void scheduler_unit::cycle() { bool execute_on_SP = false; bool execute_on_INT = false; + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); + // if INT unit pipline exist, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipline does not exist, then execute all ALU, INT @@ -1311,6 +1302,11 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::DP)) { + + bool dp_pipe_avail = + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); + if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, m_id); @@ -1326,6 +1322,11 @@ void scheduler_unit::cycle() { 
(pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { + + bool sfu_pipe_avail = + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); + if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, m_id); @@ -1337,6 +1338,12 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { + + bool tensor_core_pipe_avail = + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); + if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, warp_id, m_id); From 7d9a12fb096db5492924ec32a96c9052552e8579 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 17 May 2021 12:46:35 -0400 Subject: [PATCH 041/133] specialized unit get_ready() was missing subcore --- src/gpgpu-sim/shader.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e84e38d92..14d904424 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2272,7 +2272,8 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); From 0f3030542e1987543c5fd4e497f7d422422e73fa Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 15 Feb 2021 12:42:14 -0500 Subject: [PATCH 042/133] dirty counter added. 
NO increamenting yet --- src/gpgpu-sim/gpu-cache.cc | 24 +++++++++++++++--------- src/gpgpu-sim/gpu-cache.h | 6 ++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 1c36d224c..763705f91 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -284,15 +284,20 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, invalid_line = index; } else { // valid line : keep track of most appropriate replacement candidate - if (m_config.m_replacement_policy == LRU) { - if (line->get_last_access_time() < valid_timestamp) { - valid_timestamp = line->get_last_access_time(); - valid_line = index; - } - } else if (m_config.m_replacement_policy == FIFO) { - if (line->get_alloc_time() < valid_timestamp) { - valid_timestamp = line->get_alloc_time(); - valid_line = index; + if (!line->get_status(mask) == MODIFIED || + 100 * m_dirty/(m_config.m_nset * m_config.m_assoc) >= m_config.m_wr_percent) { + // don't evict write until dirty lines reach threshold + // make sure at least 1 candidate is assigned + if (m_config.m_replacement_policy == LRU) { + if (line->get_last_access_time() < valid_timestamp) { + valid_timestamp = line->get_last_access_time(); + valid_line = index; + } + } else if (m_config.m_replacement_policy == FIFO) { + if (line->get_alloc_time() < valid_timestamp) { + valid_timestamp = line->get_alloc_time(); + valid_line = index; + } } } } @@ -418,6 +423,7 @@ void tag_array::flush() { if (m_lines[i]->is_modified_line()) { for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty--; } is_used = false; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..9dbfe8251 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -498,10 +498,10 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u,%u", &ct, &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width, &m_wr_percent); if (ntok < 12) { if (!strcmp(config, "none")) { @@ -801,6 +801,7 @@ class cache_config { unsigned m_data_port_width; //< number of byte the cache can access per cycle enum set_index_function m_set_index_function; // Hash, linear, or custom set index function + unsigned m_wr_percent; friend class tag_array; friend class baseline_cache; @@ -897,6 +898,7 @@ class tag_array { // allocated but not filled unsigned m_res_fail; unsigned m_sector_miss; + unsigned m_dirty; // performance counters for calculating the amount of misses within a time // window From 615f173c25883fbc8db0363279e2eb216acb8c7e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Sat, 20 Feb 2021 16:03:42 -0500 Subject: [PATCH 043/133] store ack for new waps --- src/gpgpu-sim/gpu-cache.h | 6 ++++++ src/gpgpu-sim/shader.cc | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 9dbfe8251..381ce944e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -754,6 +754,9 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + 
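// --- Illustrative aside (not part of the patch series) ---------------------
// A minimal, self-contained sketch of how the extended cache-config parse above
// behaves: the extra trailing ",%u" in the sscanf format picks up the new
// write-ratio (dirty-line percentage) field when a config string supplies it.
// The example string, the variable names and the main() wrapper below are
// hypothetical stand-ins for illustration only.
#include <cstdio>

int main() {
  const char *config = "S:1:128:256,L:L:m:N:L,A:512:8,16:0,32,25";  // assumed example; trailing 25 = write ratio
  char ct, rp, wp, ap, mshr_type, wap, sif;
  unsigned nset, line_sz, assoc, mshr_entries, mshr_max_merge;
  unsigned miss_queue_size, result_fifo_entries, data_port_width, wr_percent = 0;
  int ntok = sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u,%u",
                    &ct, &nset, &line_sz, &assoc, &rp, &wp, &ap, &wap, &sif,
                    &mshr_type, &mshr_entries, &mshr_max_merge, &miss_queue_size,
                    &result_fifo_entries, &data_port_width, &wr_percent);
  // With all 16 tokens parsed, wr_percent holds the dirty-line eviction threshold (here 25%).
  printf("tokens = %d, write ratio = %u%%\n", ntok, wr_percent);
  return 0;
}
// ---------------------------------------------------------------------------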
write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } protected: void exit_parse_error() { @@ -878,6 +881,9 @@ class tag_array { void update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); + void inc_dirty() { + m_dirty++; + } protected: // This constructor is intended for use only from derived classes that wish to diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 14d904424..4769ca885 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1974,6 +1974,18 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; + if (mf_next->get_inst().is_store() && + (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + !was_writeallocate_sent(events)) { + unsigned dec_ack = + (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) + ? (mf_next->get_data_size() / SECTOR_SIZE) + : 1; + mf_next->set_reply(); + for (unsigned i = 0; i < dec_ack; ++i) m_core->store_ack(mf_next); + if (!write_sent && !read_sent) delete mf_next; + } } } From ad7204189b79be89575d969b305c529a31a2a765 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:30:27 -0500 Subject: [PATCH 044/133] sending cache block byte mask --- src/abstract_hardware_model.h | 6 ++++++ src/gpgpu-sim/gpu-cache.cc | 21 ++++++++++++++++----- src/gpgpu-sim/gpu-cache.h | 28 ++++++++++++++++++++++++++++ src/gpgpu-sim/l2cache.cc | 14 ++++++++++++++ src/gpgpu-sim/l2cache.h | 6 ++++++ src/gpgpu-sim/shader.cc | 15 +++++++++++++++ src/gpgpu-sim/shader.h | 6 ++++++ 7 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 982e41606..e09acdbf8 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -869,6 +869,12 @@ class mem_fetch_allocator { virtual mem_fetch *alloc(const class warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const = 0; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 763705f91..ded800461 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -358,8 +358,13 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; + ((sector_cache_block *)m_lines[idx])->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, - m_lines[idx]->get_modified_size()); + m_lines[idx]->get_modified_size(), + ((sector_cache_block *)m_lines[idx]) + ->get_byte_mask(), + ((sector_cache_block *)m_lines[idx]) + ->get_sector_mask()); } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -1464,6 +1469,8 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + 
block->set_status(MODIFIED, mf->get_access_sector_mask()); + ((sector_cache_block *)block)->set_byte_mask(mf); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); @@ -1484,8 +1491,10 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1560,8 +1569,10 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 381ce944e..042c1d6b7 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -72,14 +72,26 @@ enum cache_event_type { struct evicted_block_info { new_addr_type m_block_addr; unsigned m_modified_size; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; evicted_block_info() { m_block_addr = 0; m_modified_size = 0; + m_byte_mask.reset(); + m_sector_mask.reset(); } void set_info(new_addr_type block_addr, unsigned modified_size) { m_block_addr = block_addr; m_modified_size = modified_size; } + void set_info(new_addr_type block_addr, unsigned modified_size, + mem_access_byte_mask_t byte_mask, + mem_access_sector_mask_t sector_mask) { + m_block_addr = block_addr; + m_modified_size = modified_size; + m_byte_mask = byte_mask; + m_sector_mask = sector_mask; + } }; struct cache_event { @@ -251,6 +263,7 @@ struct sector_cache_block : public cache_block_t { m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; + m_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -362,6 +375,20 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_byte_mask = m_byte_mask | mf->get_access_byte_mask();; + } + virtual mem_access_byte_mask_t get_byte_mask() { + return m_byte_mask; + } + virtual mem_access_sector_mask_t get_sector_mask() { + mem_access_sector_mask_t sector_mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (m_status[i] == MODIFIED) + sector_mask.set(i); + } + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_line_last_access_time; } @@ -429,6 +456,7 @@ struct sector_cache_block : public cache_block_t { bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; bool 
m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_readable[SECTOR_CHUNCK_SIZE]; + mem_access_byte_mask_t m_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index ab6e5c228..cd04af57a 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,6 +57,20 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } +mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, + mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, + sector_mask, m_memory_config->gpgpu_ctx); + mem_fetch *mf = + new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + -1, -1, m_memory_config, cycle); + return mf; +} memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, class memory_stats_t *stats, diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 3152db337..1f5d7c468 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -51,6 +51,12 @@ class partition_mf_allocator : public mem_fetch_allocator { virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4769ca885..4b4c98db7 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -61,6 +61,21 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( m_core_id, m_cluster_id, m_memory_config, cycle); return mf; } + +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, + sector_mask, m_memory_config->gpgpu_ctx); + mem_fetch *mf = + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + m_core_id, m_cluster_id, m_memory_config, cycle); + return mf; + } ///////////////////////////////////////////////////////////////////////////// std::list shader_core_ctx::get_regs_written(const inst_t &fvt) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 8c02fd7c1..a7a2c02d6 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1872,6 +1872,12 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { } mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; From bb19c0cbfa2dc8082496a279f37f48695b7c4185 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:32:29 -0500 Subject: [PATCH 045/133] update mf breakdown at L2 --- src/gpgpu-sim/l2cache.cc | 92 +++++++++++++--------------------------- 1 file changed, 29 insertions(+), 63 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index cd04af57a..63119ee90 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -555,10 +555,15 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_config->m_L2_config.m_write_alloc_policy == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { - mf->set_reply(); - mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, - m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_icnt_queue->push(mf); + if (mf->get_access_type() == L1_WRBK_ACC) { + m_request_tracker.erase(mf); + delete mf; + } else { + mf->set_reply(); + mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + m_L2_icnt_queue->push(mf); + } } // L2 cache accepted request m_icnt_L2_queue->pop(); @@ -708,71 +713,32 @@ bool memory_sub_partition::busy() const { return !m_request_tracker.empty(); } std::vector memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { std::vector result; - + mem_access_sector_mask_t sector_mask = mf->get_access_sector_mask(); if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128 || mf->get_data_size() == 64) { - // We only accept 32, 64 and 128 bytes reqs - unsigned start = 0, end = 0; - if (mf->get_data_size() == 128) { - start = 0; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "1100") { - start = 2; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "0011") { - start = 0; - end = 1; - } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { - if (mf->get_addr() % 128 == 0) { - start = 0; - end = 1; - } else { - start = 2; - end = 3; + } else { + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (sector_mask.test(i)) { + mem_access_byte_mask_t mask; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, + 
SECTOR_SIZE, mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %s, data " - "size = %d", - mf->get_addr(), mf->get_access_sector_mask(), mf->get_data_size()); - assert(0 && "Undefined sector mask is received"); } - - std::bitset byte_sector_mask; - byte_sector_mask.reset(); - for (unsigned k = start * SECTOR_SIZE; k < SECTOR_SIZE; ++k) - byte_sector_mask.set(k); - - for (unsigned j = start, i = 0; j <= end; ++j, ++i) { - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & byte_sector_mask, - std::bitset().set(j), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); - - result.push_back(n_mf); - byte_sector_mask <<= SECTOR_SIZE; - } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %d, byte " - "mask = , data size = %u", - mf->get_addr(), mf->get_access_sector_mask().count(), - mf->get_data_size()); - assert(0 && "Undefined data size is received"); } - return result; } From e05fa4a676c2b082f1ebb34d051f43ad05d4a82c Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:33:30 -0500 Subject: [PATCH 046/133] little bug fix - flush() --- src/gpgpu-sim/gpu-cache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index ded800461..8d44f151e 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -426,9 +426,10 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); m_dirty--; + } } is_used = false; From 804ee9033d5c0d8f4e0b974734c4db42b55bd1dc Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 13:29:13 -0500 Subject: [PATCH 047/133] sending byte mask for all policies --- src/gpgpu-sim/gpu-cache.cc | 35 ++++++++++++++++++++++------------- src/gpgpu-sim/gpu-cache.h | 17 +++++++++++++++-- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 8d44f151e..2cc75bbf7 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -358,13 +358,11 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; - ((sector_cache_block *)m_lines[idx])->set_byte_mask(mf); + m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), - ((sector_cache_block *)m_lines[idx]) - ->get_byte_mask(), - ((sector_cache_block *)m_lines[idx]) - ->get_sector_mask()); + m_lines[idx]->get_byte_mask(), + m_lines[idx]->get_sector_mask()); } 
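// --- Illustrative aside (not part of the patch series) ---------------------
// A self-contained sketch of the per-sector breakdown idea used in
// breakdown_request_to_sector_requests() above: for every set bit in the sector
// mask, carve the matching 32-byte slice out of the 128-byte access byte mask.
// The sizes, example masks and the print loop are hypothetical; the real code
// builds one mem_fetch per selected sector instead of printing.
#include <bitset>
#include <cstdio>

int main() {
  const unsigned kSectorSize = 32, kSectorChunks = 4;   // assumed 4 x 32B sectors per 128B line
  std::bitset<4> sector_mask("0110");                   // example request touching sectors 1 and 2
  std::bitset<128> byte_mask;
  for (unsigned b = 40; b < 72; ++b) byte_mask.set(b);  // example: 32 accessed bytes straddling the two sectors

  for (unsigned i = 0; i < kSectorChunks; ++i) {
    if (!sector_mask.test(i)) continue;
    std::bitset<128> slice;                             // bytes that belong to sector i
    for (unsigned k = i * kSectorSize; k < (i + 1) * kSectorSize; ++k) slice.set(k);
    const std::bitset<128> sector_bytes = byte_mask & slice;
    printf("sector %u carries %zu payload byte(s)\n", i, sector_bytes.count());
  }
  return 0;
}
// ---------------------------------------------------------------------------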
m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -1083,6 +1081,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation + block->set_byte_mask(mf); } m_extra_mf_fields.erase(mf); m_bandwidth_management.use_fill_port(mf); @@ -1189,6 +1188,7 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); return HIT; } @@ -1208,6 +1208,7 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1317,8 +1318,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1356,6 +1359,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (status == HIT_RESERVED) block->set_ignore_on_fill(true, mf->get_access_sector_mask()); @@ -1364,8 +1368,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1434,8 +1440,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf 
wb->set_chip(mf->get_tlx_addr().chip); @@ -1471,7 +1479,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); - ((sector_cache_block *)block)->set_byte_mask(mf); + block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); @@ -1539,7 +1547,8 @@ enum cache_request_status data_cache::rd_hit_base( assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as dirty + mf->get_access_sector_mask()); // mark line as + block->set_byte_mask(mf); } return HIT; } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 042c1d6b7..eb811d740 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -132,7 +132,9 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; - + virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual mem_access_byte_mask_t get_byte_mask() = 0; + virtual mem_access_sector_mask_t get_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -201,6 +203,17 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_status = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + } + virtual mem_access_byte_mask_t get_byte_mask() { + return m_byte_mask; + } + virtual mem_access_sector_mask_t get_sector_mask() { + mem_access_sector_mask_t sector_mask; + if (m_status == MODIFIED) sector_mask.set(); + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_last_access_time; } @@ -244,6 +257,7 @@ struct line_cache_block : public cache_block_t { bool m_set_modified_on_fill; bool m_set_readable_on_fill; bool m_readable; + mem_access_byte_mask_t m_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -328,7 +342,6 @@ struct sector_cache_block : public cache_block_t { // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); - m_status[sidx] = m_set_modified_on_fill[sidx] ? 
MODIFIED : VALID; if (m_set_readable_on_fill[sidx]) { From b3dab5eec75f11c600bddc9a6dd3b22272363cca Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 15:02:58 -0500 Subject: [PATCH 048/133] set byte mask on fill --- src/gpgpu-sim/gpu-cache.cc | 12 ++++++------ src/gpgpu-sim/gpu-cache.h | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 2cc75bbf7..46813e742 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -392,11 +392,11 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, } void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask()); + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask) { + mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; enum cache_request_status status = probe(addr, idx, mask); @@ -410,12 +410,12 @@ void tag_array::fill(new_addr_type addr, unsigned time, ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - m_lines[idx]->fill(time, mask); + m_lines[idx]->fill(time, mask, byte_mask); } void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask()); + m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); } // TODO: we need write back the flushed data to the upper level @@ -1432,6 +1432,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( cache_block_t *block = m_tag_array->get_block(cache_index); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); events.push_back(cache_event(WRITE_ALLOCATE_SENT)); @@ -1483,8 +1484,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); - } else { - block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index eb811d740..a84ddd18a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -121,7 +121,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -133,6 +134,7 @@ struct cache_block_t { virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; virtual mem_access_byte_mask_t get_byte_mask() = 0; virtual mem_access_sector_mask_t get_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; @@ -145,6 +147,7 @@ struct cache_block_t { 
mem_access_sector_mask_t sector_mask) = 0; virtual void set_readable_on_fill(bool readable, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_byte_mask_on_fill(bool m_modified) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -178,8 +181,10 @@ struct line_cache_block : public cache_block_t { m_ignore_on_fill_status = false; m_set_modified_on_fill = false; m_set_readable_on_fill = false; + m_set_byte_mask_on_fill = false; } - void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); @@ -187,6 +192,7 @@ struct line_cache_block : public cache_block_t { if (m_set_readable_on_fill) m_readable = true; + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_fill_time = time; } @@ -206,6 +212,9 @@ struct line_cache_block : public cache_block_t { virtual void set_byte_mask(mem_fetch *mf) { m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_byte_mask = m_byte_mask | byte_mask; + } virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } @@ -234,6 +243,9 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_readable_on_fill = readable; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. cache line size } @@ -256,6 +268,7 @@ struct line_cache_block : public cache_block_t { bool m_ignore_on_fill_status; bool m_set_modified_on_fill; bool m_set_readable_on_fill; + bool m_set_byte_mask_on_fill; bool m_readable; mem_access_byte_mask_t m_byte_mask; }; @@ -303,6 +316,7 @@ struct sector_cache_block : public cache_block_t { m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; m_set_readable_on_fill[sidx] = false; + m_set_byte_mask_on_fill = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -337,7 +351,8 @@ struct sector_cache_block : public cache_block_t { m_line_fill_time = 0; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { unsigned sidx = get_sector_index(sector_mask); // if(!m_ignore_on_fill_status[sidx]) @@ -348,6 +363,7 @@ struct sector_cache_block : public cache_block_t { m_readable[sidx] = true; m_set_readable_on_fill[sidx] = false; } + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_sector_fill_time[sidx] = time; m_line_fill_time = time; @@ -389,7 +405,10 @@ struct sector_cache_block : public cache_block_t { } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask();; + m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_byte_mask = m_byte_mask | byte_mask; } virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; @@ -427,6 +446,9 @@ struct sector_cache_block : public cache_block_t { unsigned sidx = get_sector_index(sector_mask); m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual void 
set_readable_on_fill(bool readable, mem_access_sector_mask_t sector_mask) { @@ -468,6 +490,7 @@ struct sector_cache_block : public cache_block_t { bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; mem_access_byte_mask_t m_byte_mask; @@ -904,7 +927,8 @@ class tag_array { void fill(new_addr_type addr, unsigned time, mem_fetch *mf); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask); + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -1291,7 +1315,8 @@ class baseline_cache : public cache_t { // something is read or written without doing anything else. void force_tag_access(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask) { - m_tag_array->fill(addr, time, mask); + mem_access_byte_mask_t byte_mask; + m_tag_array->fill(addr, time, mask, byte_mask); } protected: From 40077df94f1afcfaabdc9599d7a2c25d3d98da8a Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 17:58:02 -0500 Subject: [PATCH 049/133] solve deadlock for non-sectored cache configs --- src/gpgpu-sim/l2cache.cc | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 63119ee90..00b14d7f6 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -717,6 +717,52 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); + } else if (mf->get_data_size() == 128) { + // break down every sector + mem_access_byte_mask_t mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, + mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); + } + } else if (mf->get_data_size() == 64 && + (mf->get_access_sector_mask().to_string() == "1111" || + mf->get_access_sector_mask().to_string() == "0000")) { + unsigned start; + if (mf->get_addr() % 128 == 0) + start = 0; + else + start = 2; + mem_access_byte_mask_t mask; + for (unsigned i = start; i < start + 2; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr(), SECTOR_SIZE, + mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); + } } else { for (unsigned i = 0; i < 
SECTOR_CHUNCK_SIZE; i++) { if (sector_mask.test(i)) { @@ -739,6 +785,7 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { } } } + if (result.size() == 0) assert(0 && "no mf sent"); return result; } From 64bf6fd7a44a32773389e900862bd9c0527a87e9 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 18 Mar 2021 13:31:41 -0400 Subject: [PATCH 050/133] dirty counter not resetting after kernel finish --- src/gpgpu-sim/gpu-cache.cc | 80 +++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 46813e742..5ac202cea 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -210,6 +210,7 @@ void tag_array::init(int core_id, int type_id) { m_core_id = core_id; m_type_id = type_id; is_used = false; + m_dirty = 0; } void tag_array::add_pending_line(mem_fetch *mf) { @@ -250,7 +251,22 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; + unsigned count = 0; + if (m_config.m_wr_percent == (unsigned)25) { + for (unsigned i = 0; i < m_config.m_nset * m_config.m_assoc; i++) { + if (m_lines[i]->is_modified_line()) { + m_lines[i]->is_modified_line(); + count++; + } + } + if (count != m_dirty) { + printf("count = %u, m_dirty = %u",count,m_dirty); + fflush(stdout); + assert(0 && "m_dirty miss match"); + printf("count = %u, m_dirty = %u",count,m_dirty); + } + } // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; @@ -279,15 +295,17 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { - all_reserved = false; - if (line->is_invalid_line()) { - invalid_line = index; - } else { - // valid line : keep track of most appropriate replacement candidate - if (!line->get_status(mask) == MODIFIED || - 100 * m_dirty/(m_config.m_nset * m_config.m_assoc) >= m_config.m_wr_percent) { - // don't evict write until dirty lines reach threshold - // make sure at least 1 candidate is assigned + if (!line->is_modified_line() || + 100 * m_dirty / (m_config.m_nset * m_config.m_assoc) >= + m_config.m_wr_percent) { + all_reserved = false; + if (line->is_invalid_line()) { + invalid_line = index; + } else { + // valid line : keep track of most appropriate replacement candidate + + // don't evict write until dirty lines reach threshold + // make sure at least 1 candidate is assigned if (m_config.m_replacement_policy == LRU) { if (line->get_last_access_time() < valid_timestamp) { valid_timestamp = line->get_last_access_time(); @@ -363,6 +381,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_lines[idx]->get_modified_size(), m_lines[idx]->get_byte_mask(), m_lines[idx]->get_sector_mask()); + m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -373,8 +392,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_sector_miss++; shader_cache_access_log(m_core_id, m_type_id, 1); // log cache misses if (m_config.m_alloc_policy == ON_MISS) { + bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case RESERVATION_FAIL: 
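// --- Illustrative aside (not part of the patch series) ---------------------
// Self-contained sketch of the eviction gate the probe() hunk above introduces:
// a MODIFIED line is only considered as a replacement candidate once the
// fraction of dirty lines in the cache reaches the configured write percentage,
// so clean lines are preferred victims until that threshold is hit. The struct,
// field names and example numbers below are hypothetical stand-ins.
#include <cstdio>

struct gate_cfg { unsigned nset, assoc, wr_percent; };

bool may_evict_dirty(unsigned dirty_lines, const gate_cfg &cfg) {
  // same integer arithmetic as the patch:
  // 100 * m_dirty / (m_config.m_nset * m_config.m_assoc) >= m_config.m_wr_percent
  return 100u * dirty_lines / (cfg.nset * cfg.assoc) >= cfg.wr_percent;
}

int main() {
  gate_cfg cfg = {64, 6, 25};                  // e.g. 64 sets x 6 ways, 25% threshold
  printf("%d\n", may_evict_dirty(50, cfg));    // 50/384  ~ 13%: prints 0, dirty lines stay put
  printf("%d\n", may_evict_dirty(120, cfg));   // 120/384 ~ 31%: prints 1, dirty lines may be evicted
  return 0;
}
// ---------------------------------------------------------------------------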
@@ -400,22 +423,35 @@ void tag_array::fill(new_addr_type addr, unsigned time, // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; enum cache_request_status status = probe(addr, idx, mask); + bool before = false; // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request - if (status == MISS) + if (status == MISS) { + before = m_lines[idx]->is_modified_line(); m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); - else if (status == SECTOR_MISS) { + } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); + before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } + before = m_lines[idx]->is_modified_line(); m_lines[idx]->fill(time, mask, byte_mask); + if (m_lines[idx]->is_modified_line() && !before) { + m_dirty++; + } } void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); + bool before = m_lines[index]->is_modified_line(); m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need write back the flushed data to the upper level @@ -424,9 +460,9 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { + m_dirty--; for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); - m_dirty--; } } @@ -1078,6 +1114,9 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (has_atomic) { assert(m_config.m_alloc_policy == ON_MISS); cache_block_t *block = m_tag_array->get_block(e->second.m_cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation @@ -1187,6 +1226,9 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); @@ -1207,6 +1249,9 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); @@ -1358,6 +1403,9 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); if (status == HIT_RESERVED) @@ -1479,6 +1527,9 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if 
(!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { @@ -1546,6 +1597,9 @@ enum cache_request_status data_cache::rd_hit_base( if (mf->isatomic()) { assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as block->set_byte_mask(mf); From a374b330ac3bec0b47ce588adf72af89e5cd9307 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 26 Mar 2021 16:33:41 -0400 Subject: [PATCH 051/133] remove MSHR_HIT from cache total access --- src/gpgpu-sim/gpu-cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 5ac202cea..d2f9fef9c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -819,7 +819,7 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { cache_request_status_str((enum cache_request_status)status), m_stats[type][status]); - if (status != RESERVATION_FAIL) + if (status != RESERVATION_FAIL && status != MSHR_HIT) total_access[type] += m_stats[type][status]; } } From f6fb56ba32141030803ecfe01b52a6f6c93d8e6c Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 6 Apr 2021 15:46:03 -0400 Subject: [PATCH 052/133] check sector readable only on reads --- src/gpgpu-sim/gpu-cache.cc | 27 ++++++++++++++------------- src/gpgpu-sim/gpu-cache.h | 10 ++++++---- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index d2f9fef9c..9c65476b1 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -232,15 +232,15 @@ void tag_array::remove_pending_line(mem_fetch *mf) { } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, + mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask, probe_mode, mf); + return probe(addr, idx, mask,is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, - bool probe_mode, + bool is_write, bool probe_mode, mem_fetch *mf) const { // assert( m_config.m_write_policy == READ_ONLY ); unsigned set_index = m_config.set_index(addr); @@ -279,7 +279,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, idx = index; return HIT; } else if (line->get_status(mask) == MODIFIED) { - if (line->is_readable(mask)) { + if ((!is_write && line->is_readable(mask)) || is_write) { idx = index; return HIT; } else { @@ -363,7 +363,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_access++; is_used = true; shader_cache_access_log(m_core_id, m_type_id, 0); // log accesses to cache - enum cache_request_status status = probe(addr, idx, mf); + enum cache_request_status status = probe(addr, idx, mf, mf->is_write()); switch (status) { case HIT_RESERVED: m_pending_hit++; @@ -414,16 +414,17 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); +void 
tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), is_write); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask) { + mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, + bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask); - bool before = false; + enum cache_request_status status = probe(addr, idx, mask,is_write); + bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request if (status == MISS) { @@ -1105,7 +1106,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (m_config.m_alloc_policy == ON_MISS) m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { - m_tag_array->fill(e->second.m_block_addr, time, mf); + m_tag_array->fill(e->second.m_block_addr, time, mf, mf->is_write()); if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); } else abort(); @@ -1659,7 +1660,7 @@ enum cache_request_status read_only_cache::access( new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status status = - m_tag_array->probe(block_addr, cache_index, mf); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write()); enum cache_request_status cache_status = RESERVATION_FAIL; if (status == HIT) { @@ -1746,7 +1747,7 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf, new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status probe_status = - m_tag_array->probe(block_addr, cache_index, mf, true); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); enum cache_request_status access_status = process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events); m_stats.inc_stats(mf->get_access_type(), diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index a84ddd18a..c2e302ead 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -914,9 +914,11 @@ class tag_array { ~tag_array(); enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, bool probe_mode = false) const; + mem_fetch *mf, bool is_write, + bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, + bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, @@ -925,10 +927,10 @@ class tag_array { unsigned &idx, bool &wb, evicted_block_info &evicted, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_fetch *mf); + void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, - mem_access_byte_mask_t byte_mask); + mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -1316,7 +1318,7 @@ class baseline_cache : public cache_t { void force_tag_access(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask) { mem_access_byte_mask_t byte_mask; - 
m_tag_array->fill(addr, time, mask, byte_mask); + m_tag_array->fill(addr, time, mask, byte_mask, true); } protected: From 994fb19e160e3897b5662fb7e6946a3802fde794 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 4 May 2021 15:17:57 -0400 Subject: [PATCH 053/133] reset dirty counter --- src/gpgpu-sim/gpu-cache.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 9c65476b1..e88a64627 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -428,12 +428,10 @@ void tag_array::fill(new_addr_type addr, unsigned time, // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request if (status == MISS) { - before = m_lines[idx]->is_modified_line(); m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); - before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } if (before && !m_lines[idx]->is_modified_line()) { @@ -458,10 +456,10 @@ void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { // TODO: we need write back the flushed data to the upper level void tag_array::flush() { if (!is_used) return; + m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - m_dirty--; for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } @@ -472,6 +470,7 @@ void tag_array::flush() { void tag_array::invalidate() { if (!is_used) return; + m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) From 73069303b3dc0845e33b9ddafa7e6697fe3deb38 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 11 May 2021 22:45:44 -0400 Subject: [PATCH 054/133] remove runtime check of dirty counter --- src/gpgpu-sim/gpu-cache.cc | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index e88a64627..9e1db8bc0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -251,22 +251,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; - unsigned count = 0; - if (m_config.m_wr_percent == (unsigned)25) { - for (unsigned i = 0; i < m_config.m_nset * m_config.m_assoc; i++) { - if (m_lines[i]->is_modified_line()) { - m_lines[i]->is_modified_line(); - count++; - } - } - if (count != m_dirty) { - printf("count = %u, m_dirty = %u",count,m_dirty); - fflush(stdout); - assert(0 && "m_dirty miss match"); - printf("count = %u, m_dirty = %u",count,m_dirty); - - } - } // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; From 0601354a4d7f7f106e008b47cbc74097ec0a2a69 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 18 May 2021 14:35:04 -0400 Subject: [PATCH 055/133] Add WT to lazy_fetch_on_read --- src/gpgpu-sim/gpu-cache.cc | 29 ++++++++++++++++++++++++++--- src/gpgpu-sim/gpu-cache.h | 3 +++ src/gpgpu-sim/shader.cc | 5 +++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 9e1db8bc0..390bacce2 100644 --- 
a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1494,16 +1494,39 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list<cache_event> &events, enum cache_request_status status) { new_addr_type block_addr = m_config.block_addr(addr); + new_addr_type mshr_addr = m_config.mshr_addr(mf->get_addr()); // if the request writes to the whole cache line/sector, then, write and set // cache line Modified. and no need to send read request to memory or reserve // mshr - if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); - return RESERVATION_FAIL; // cannot handle request this cycle + // Write allocate, maximum 2 requests (write miss, write back request) + // Conservatively ensure the worst-case request can be handled this + // cycle + if (m_config.m_write_policy == WRITE_THROUGH) { + bool mshr_hit = m_mshrs.probe(mshr_addr); + bool mshr_avail = !m_mshrs.full(mshr_addr); + if (miss_queue_full(1) || + (!(mshr_hit && mshr_avail) && + !(!mshr_hit && mshr_avail && + (m_miss_queue.size() < m_config.m_miss_queue_size)))) { + // check what is the exactly the failure reason + if (miss_queue_full(1)) + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + else if (mshr_hit && !mshr_avail) + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + else if (!mshr_hit && !mshr_avail) + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + else + assert(0); + + return RESERVATION_FAIL; + } + + send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } + bool wb = false; evicted_block_info evicted; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index c2e302ead..6811b868e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -821,6 +821,9 @@ class cache_config { write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } + write_policy_t get_write_policy() { + return m_write_policy; + } protected: void exit_parse_error() { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4b4c98db7..22bd8e9a9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1989,9 +1989,10 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; - if (mf_next->get_inst().is_store() && + if (m_config->m_L1D_config.get_write_policy() != WRITE_THROUGH && + mf_next->get_inst().is_store() && (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || - m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { unsigned dec_ack = (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) From f7833519471ce92619bd1e4807ec07eb55aed76e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:06 -0400 Subject: [PATCH 056/133] new configs - adaptive cache and cache write ratio --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 ++ configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++++ configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 3 +++ src/abstract_hardware_model.h | 2 ++ src/gpgpu-sim/gpu-cache.h | 5 +++++ src/gpgpu-sim/gpu-sim.cc | 7 +++++++ 6 files changed, 25 insertions(+) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
index 6189dca0f..e006085df 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -110,6 +110,8 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index bc5677cf3..043fce64c 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -124,6 +124,9 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 @@ -203,3 +206,6 @@ #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3fa51ee14..1f0c15f51 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -125,6 +125,9 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 982e41606..e796571dc 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -373,6 +373,8 @@ class core_config { } unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; + char *gpgpu_shmem_option; + unsigned gpgpu_unified_l1d_size; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..ccc935bae 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -491,6 +491,7 @@ class cache_config { m_data_port_width = 0; m_set_index_function = LINEAR_SET_FUNCTION; m_is_streaming = false; + m_wr_percent = 0; } void init(char *config, FuncCache status) { cache_status = status; @@ -754,6 +755,10 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + unsigned m_wr_percent; + write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } protected: void exit_parse_error() { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index fd36e006a..bd09cdbe5 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,6 +249,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); + option_parser_register(opp,"-gpgpu_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); @@ -326,6 +327,12 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); + option_parser_register( + opp, "-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, + "Option list of shared memory sizes", "0"); + option_parser_register( + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &gpgpu_unified_l1d_size, + "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); option_parser_register( From a2b1b1c2839fe3fc05a0cae126204120fab00f62 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:53 -0400 Subject: [PATCH 057/133] adaptive cache - update --- src/abstract_hardware_model.h | 2 +- src/gpgpu-sim/gpu-cache.h | 11 ++++ src/gpgpu-sim/shader.cc | 95 +++++++++++++++++++++-------------- 3 files changed, 68 insertions(+), 40 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e796571dc..bd10a93fe 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -65,7 +65,7 @@ enum FuncCache { FuncCachePreferL1 = 2 }; -enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; +enum AdaptiveCache { FIXED = 0, ADAPTIVE_CACHE = 1 }; #ifdef __cplusplus diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index ccc935bae..0162b6cbc 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -616,6 +616,8 @@ class cache_config { m_atom_sz = (m_cache_type == SECTOR) ? 
SECTOR_SIZE : m_line_sz; m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; + original_sz = m_nset * original_m_assoc * m_line_sz; + // For more details about difference between FETCH_ON_WRITE and WRITE // VALIDAE policies Read: Jouppi, Norman P. "Cache write policies and @@ -710,6 +712,14 @@ class cache_config { assert(m_valid); return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; } + unsigned get_original_assoc() const { + assert(m_valid); + return original_m_assoc; + } + unsigned get_original_sz() const { + assert(m_valid); + return original_sz; + } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", m_line_sz * m_nset * m_assoc, m_nset, m_assoc, m_line_sz); @@ -777,6 +787,7 @@ class cache_config { unsigned m_atom_sz; unsigned m_sector_sz_log2; unsigned original_m_assoc; + unsigned original_sz; bool m_is_streaming; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 14d904424..b2adb4f53 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3292,50 +3292,67 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - unsigned total_shmed = kernel_info->smem * result; - assert(total_shmed >= 0 && total_shmed <= gpgpu_shmem_size); - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - // assert(m_L1D_config.get_nset() == 4); //Volta L1 has four sets - if (total_shmed < gpgpu_shmem_size) { - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_VOLTA: { - // For Volta, we assign the remaining shared memory to L1 cache - // For more info about adaptive cache, see - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - - // To Do: make it flexible and not tuned to 9KB share memory - unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmed == 0) - m_L1D_config.set_assoc(max_assoc); // L1 is 128KB and shd=0 - else if (total_shmed > 0 && total_shmed <= 8192) - m_L1D_config.set_assoc(0.9375 * - max_assoc); // L1 is 120KB and shd=8KB - else if (total_shmed > 8192 && total_shmed <= 16384) - m_L1D_config.set_assoc(0.875 * - max_assoc); // L1 is 112KB and shd=16KB - else if (total_shmed > 16384 && total_shmed <= 32768) - m_L1D_config.set_assoc(0.75 * max_assoc); // L1 is 96KB and - // shd=32KB - else if (total_shmed > 32768 && total_shmed <= 65536) - m_L1D_config.set_assoc(0.5 * max_assoc); // L1 is 64KB and shd=64KB - else if (total_shmed > 65536 && total_shmed <= gpgpu_shmem_size) - m_L1D_config.set_assoc(0.25 * max_assoc); // L1 is 32KB and - // shd=96KB - else - assert(0); - break; + std::vector shmem_list; + for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { + char option[4]; + int j = 0; + while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { + if (gpgpu_shmem_option[i] == ' ') { + // skip spaces + i++; + } else { + if (!isdigit(gpgpu_shmem_option[i])) { + // check for non digits, which should not be here + assert(0 && "invalid config: -gpgpu_shmem_option"); + } + option[j] = gpgpu_shmem_option[i]; + j++; + i++; } - default: - assert(0); } + // convert KB -> B + shmem_list.push_back((unsigned)atoi(option) * 1024); + } - printf("GPGPU-Sim: Reconfigure L1 cache to 
%uKB\n", - m_L1D_config.get_total_size_inKB()); + unsigned total_shmem = kernel_info->smem * result; + unsigned total_unified = gpgpu_unified_l1d_size * 1024; + std::sort(shmem_list.begin(), shmem_list.end()); + + assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); + switch (adaptive_cache_config) { + case FIXED: + break; + case ADAPTIVE_CACHE: { + // For more info about adaptive cache, see + bool l1d_configured = false; + unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; + unsigned max_assoc = m_L1D_config.get_original_assoc() * + gpgpu_unified_l1d_size / l1_defined; + + if (total_shmem == 0) { + m_L1D_config.set_assoc(max_assoc); + l1d_configured = true; + } else { + for (std::vector::iterator it = shmem_list.begin(); + it < shmem_list.end() - 1; it++) { + if (total_shmem > *it && total_shmem <= *(it + 1)) { + float l1_ratio = 1 - (float) *(it + 1) / total_unified; + m_L1D_config.set_assoc(max_assoc * l1_ratio); + l1d_configured = true; + break; + } + } + } + assert(l1d_configured && "no shared memory option found"); + break; + } + default: + assert(0); } + printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + m_L1D_config.get_total_size_inKB()); + k.cache_config_set = true; } From f70f5d6e5599c643074b0d00d3e3dcc385e5913d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:10:51 -0400 Subject: [PATCH 058/133] re-wording/formatting --- src/gpgpu-sim/gpu-cache.cc | 17 ++++++++--------- src/gpgpu-sim/gpu-cache.h | 6 +++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 390bacce2..05b338ea6 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -280,7 +280,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } if (!line->is_reserved_line()) { if (!line->is_modified_line() || - 100 * m_dirty / (m_config.m_nset * m_config.m_assoc) >= + m_dirty / (m_config.m_nset * m_config.m_assoc * 100) >= m_config.m_wr_percent) { all_reserved = false; if (line->is_invalid_line()) { @@ -364,7 +364,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_byte_mask(), - m_lines[idx]->get_sector_mask()); + m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), @@ -430,17 +430,13 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - bool before = m_lines[index]->is_modified_line(); m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); - if (m_lines[index]->is_modified_line() && !before) { - m_dirty++; - } + m_dirty++; } // TODO: we need write back the flushed data to the upper level void tag_array::flush() { if (!is_used) return; - m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { @@ -448,18 +444,19 @@ void tag_array::flush() { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } } - + + m_dirty = 0; is_used = false; } void tag_array::invalidate() { if (!is_used) return; - m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty = 0; is_used = false; } @@ -804,6 
+801,8 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { m_stats[type][status]); if (status != RESERVATION_FAIL && status != MSHR_HIT) + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS total_access[type] += m_stats[type][status]; } } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 6811b868e..51791735a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -136,7 +136,7 @@ struct cache_block_t { virtual void set_byte_mask(mem_fetch *mf) = 0; virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; virtual mem_access_byte_mask_t get_byte_mask() = 0; - virtual mem_access_sector_mask_t get_sector_mask() = 0; + virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -218,7 +218,7 @@ struct line_cache_block : public cache_block_t { virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } - virtual mem_access_sector_mask_t get_sector_mask() { + virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; if (m_status == MODIFIED) sector_mask.set(); return sector_mask; @@ -413,7 +413,7 @@ struct sector_cache_block : public cache_block_t { virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } - virtual mem_access_sector_mask_t get_sector_mask() { + virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { if (m_status[i] == MODIFIED) From 4a762a933a054b5124fa46a12789ea98f5e2411d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:22:31 -0400 Subject: [PATCH 059/133] formatting again --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 4 ++-- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 8 ++------ configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/shader.cc | 1 + 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index e006085df..d7573ab33 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -100,6 +100,8 @@ # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 65536 @@ -110,8 +112,6 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_shmem_option 0,8,16,32,64,100 --gpgpu_unified_l1d_size 128 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). 
This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 043fce64c..59c7f43f7 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -124,7 +124,7 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_cache_write_ratio 25 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_shmem_option 0,12,24,48,96 -gpgpu_unified_l1d_size 128 @@ -204,8 +204,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - --gpgpu_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 \ No newline at end of file +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 1f0c15f51..3e080bcc5 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -125,7 +125,7 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_cache_write_ratio 25 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_shmem_option 0,12,24,48,96 -gpgpu_unified_l1d_size 128 diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index bd09cdbe5..a2aa9293f 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,7 +249,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); - option_parser_register(opp,"-gpgpu_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); + option_parser_register(opp,"-gpgpu_l1_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index b2adb4f53..141c700db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3326,6 +3326,7 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // For more info about adaptive cache, see bool l1d_configured = false; unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; + assert(gpgpu_unified_l1d_size % l1_defined == 0); unsigned max_assoc = m_L1D_config.get_original_assoc() * gpgpu_unified_l1d_size / l1_defined; From 4c354ebda2c92bb5866c20f03a254743c8ec85a3 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:45:35 -0400 Subject: [PATCH 060/133] minor improvements --- src/gpgpu-sim/gpu-cache.cc | 14 ++++++++------ src/gpgpu-sim/gpu-cache.h | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 05b338ea6..98951cabb 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -279,17 +279,19 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { + // percentage of dirty lines in the cache + // number of dirty lines / total lines in the cache + float dirty_line_percentage = + (float) (m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; if (!line->is_modified_line() || - m_dirty / (m_config.m_nset * m_config.m_assoc * 100) >= - m_config.m_wr_percent) { + dirty_line_percentage >= m_config.m_wr_percent) { + // if number of dirty lines in the cache is greater than + 
// a specific value all_reserved = false; if (line->is_invalid_line()) { invalid_line = index; } else { // valid line : keep track of most appropriate replacement candidate - - // don't evict write until dirty lines reach threshold - // make sure at least 1 candidate is assigned if (m_config.m_replacement_policy == LRU) { if (line->get_last_access_time() < valid_timestamp) { valid_timestamp = line->get_last_access_time(); @@ -363,7 +365,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), - m_lines[idx]->get_byte_mask(), + m_lines[idx]->get_dirty_byte_mask(), m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 51791735a..dc3b39a50 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -135,7 +135,7 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_byte_mask(mem_fetch *mf) = 0; virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; - virtual mem_access_byte_mask_t get_byte_mask() = 0; + virtual mem_access_byte_mask_t get_dirty_byte_mask() = 0; virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, @@ -215,7 +215,7 @@ struct line_cache_block : public cache_block_t { virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { m_byte_mask = m_byte_mask | byte_mask; } - virtual mem_access_byte_mask_t get_byte_mask() { + virtual mem_access_byte_mask_t get_dirty_byte_mask() { return m_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { @@ -410,7 +410,7 @@ struct sector_cache_block : public cache_block_t { virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { m_byte_mask = m_byte_mask | byte_mask; } - virtual mem_access_byte_mask_t get_byte_mask() { + virtual mem_access_byte_mask_t get_dirty_byte_mask() { return m_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { From f27da224f3e468d600499a9d3619009ed9c70256 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 17:27:43 -0400 Subject: [PATCH 061/133] Use cache config multipilier when possible --- src/abstract_hardware_model.h | 1 - src/gpgpu-sim/gpu-cache.h | 28 +++++++++++++++------------- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/shader.cc | 8 +++----- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index bd10a93fe..dbe138a66 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -374,7 +374,6 @@ class core_config { unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; char *gpgpu_shmem_option; - unsigned gpgpu_unified_l1d_size; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 0162b6cbc..87a6b13e7 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -512,6 +512,14 @@ class cache_config { exit_parse_error(); } + // set * assoc * cacheline size. 
Then convert Byte to KB + unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; + if (m_unified_cache_size > 0) { + max_cache_multiplier = m_unified_cache_size / original_size; + } else { + max_cache_multiplier = MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + switch (ct) { case 'N': m_cache_type = NORMAL; @@ -588,7 +596,7 @@ class cache_config { // https://ieeexplore.ieee.org/document/8344474/ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + m_mshr_entries = m_nset * m_assoc * max_cache_multiplier; if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; m_mshr_max_merge = MAX_WARP_PER_SM; } @@ -616,7 +624,6 @@ class cache_config { m_atom_sz = (m_cache_type == SECTOR) ? SECTOR_SIZE : m_line_sz; m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; - original_sz = m_nset * original_m_assoc * m_line_sz; // For more details about difference between FETCH_ON_WRITE and WRITE @@ -706,19 +713,13 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * m_nset * original_m_assoc; + // gpgpu_unified_cache_size is in KB while original_sz is in B + return max_cache_multiplier * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; - } - unsigned get_original_assoc() const { - assert(m_valid); - return original_m_assoc; - } - unsigned get_original_sz() const { - assert(m_valid); - return original_sz; + // gpgpu_unified_cache_size is in KB while original_sz is in B + return max_cache_multiplier * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -766,6 +767,7 @@ class cache_config { char *m_config_stringPrefShared; FuncCache cache_status; unsigned m_wr_percent; + unsigned m_unified_cache_size; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } @@ -787,8 +789,8 @@ class cache_config { unsigned m_atom_sz; unsigned m_sector_sz_log2; unsigned original_m_assoc; - unsigned original_sz; bool m_is_streaming; + unsigned max_cache_multiplier; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO enum write_policy_t diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index a2aa9293f..df3004772 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -331,7 +331,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { opp, "-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, "Option list of shared memory sizes", "0"); option_parser_register( - opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &gpgpu_unified_l1d_size, + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &m_L1D_config.m_unified_cache_size, "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 141c700db..3efef2b34 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3315,7 +3315,8 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { } unsigned total_shmem = kernel_info->smem * result; - unsigned total_unified = gpgpu_unified_l1d_size * 1024; + // Unified cache config is in KB. 
Converting to B + unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; std::sort(shmem_list.begin(), shmem_list.end()); assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); @@ -3325,10 +3326,7 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { case ADAPTIVE_CACHE: { // For more info about adaptive cache, see bool l1d_configured = false; - unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; - assert(gpgpu_unified_l1d_size % l1_defined == 0); - unsigned max_assoc = m_L1D_config.get_original_assoc() * - gpgpu_unified_l1d_size / l1_defined; + unsigned max_assoc = m_L1D_config.get_max_assoc(); if (total_shmem == 0) { m_L1D_config.set_assoc(max_assoc); From 14f22bcdd171cdeb8d8f56f9ed02d6f711189be8 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 17:56:14 -0400 Subject: [PATCH 062/133] add checking on spec unit in subcore --- src/gpgpu-sim/shader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 6229d1625..2513dde11 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -108,7 +108,7 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->sub_core_model) { // in subcore model, each scheduler should has its own issue register, so - // num scheduler = reg width + // ensure num scheduler = reg width assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_SP].get_size()); assert(m_config->gpgpu_num_sched_per_core == @@ -124,6 +124,11 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->gpgpu_num_int_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_INT].get_size()); + for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + if (m_config->m_specialized_unit[j].num_units > 0) + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); + } } m_threadState = (thread_ctx_t *)calloc(sizeof(thread_ctx_t), From 604baaf59255776b4714c0270ce36ad823d34df4 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 18:28:41 -0400 Subject: [PATCH 063/133] fixing the failing of merging --- src/gpgpu-sim/gpu-cache.h | 3 +-- src/gpgpu-sim/shader.cc | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 75dce40f4..d80152812 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -841,8 +841,8 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; - unsigned m_wr_percent; unsigned m_unified_cache_size; + unsigned m_wr_percent; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } @@ -897,7 +897,6 @@ class cache_config { unsigned m_data_port_width; //< number of byte the cache can access per cycle enum set_index_function m_set_index_function; // Hash, linear, or custom set index function - unsigned m_wr_percent; friend class tag_array; friend class baseline_cache; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index db53fca7b..75fbe1646 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3391,13 +3391,12 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { assert(0); } -<<<<<<< HEAD if(m_L1D_config.is_streaming()) { //for streaming cache, if the whole memory is allocated //to the L1 cache, then make the allocation to be on_MISS //otherwise, make it ON_FILL to eliminate line allocation fails //i.e. MSHR throughput is the same, independent on the L1 cache size/associativity - if(total_shmed == 0) { + if(total_shmem == 0) { m_L1D_config.set_allocation_policy(ON_MISS); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); } @@ -3406,10 +3405,8 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); } } -======= printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", m_L1D_config.get_total_size_inKB()); ->>>>>>> 2b2b6a2916e4ed833c707be887bf927167a71fa6 k.cache_config_set = true; } From a2ba2f57e8a24b9dd6ec6f2568accbbf439a9dca Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 19:39:48 -0400 Subject: [PATCH 064/133] updating config files with right adaptive cache parameters --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 18 ++++++++++-------- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 19 ++++++++++--------- .../tested-cfgs/SM7_TITANV/gpgpusim.config | 17 +++++++++-------- .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 11 +++++++---- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index d7573ab33..9e50fa305 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -99,19 +99,21 @@ # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo --gpgpu_adaptive_cache_config 0 --gpgpu_shmem_option 0,8,16,32,64,100 --gpgpu_unified_l1d_size 128 +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 32,64 +-gpgpu_unified_l1d_size 96 +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_flush_l1_cache 1 +# shared memory configuration -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 59c7f43f7..3750de09f 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -94,7 +94,7 @@ -gpgpu_shmem_num_banks 32 -gpgpu_shmem_limited_broadcast 0 -gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 60 +-gpgpu_coalesce_arch 70 # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 @@ -113,20 +113,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 --gpgpu_l1_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 32245d78a..e7f73059a 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -114,20 +114,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_l1_latency 20 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 --gpgpu_l1_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index f5418ad8e..3c0db06a8 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -107,17 +107,20 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_l1_latency 20 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_flush_l1_cache 1 +# shared memory configuration -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 3MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 From b63d19a55c320b0bfd3ba4c80fe6f47a11bba39b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 19:41:22 -0400 Subject: [PATCH 065/133] updating config files --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 1 + .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 1 + configs/tested-cfgs/TITAN_V/gpgpusim.config | 173 ++++++++++++++++++ configs/tested-cfgs/TITAN_V/trace.config | 18 ++ 4 files changed, 193 insertions(+) create mode 100644 configs/tested-cfgs/TITAN_V/gpgpusim.config create mode 100644 configs/tested-cfgs/TITAN_V/trace.config diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 9e50fa305..856f5cffd 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -105,6 +105,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 -gpgpu_n_cluster_ejection_buffer_size 32 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 3c0db06a8..9123e206f 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -112,6 +112,7 @@ # Ampere unified cache has four banks -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 -gpgpu_n_cluster_ejection_buffer_size 32 diff --git a/configs/tested-cfgs/TITAN_V/gpgpusim.config b/configs/tested-cfgs/TITAN_V/gpgpusim.config new file mode 100644 index 000000000..8b5cb202f --- /dev/null +++ b/configs/tested-cfgs/TITAN_V/gpgpusim.config @@ -0,0 +1,173 @@ +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 6745 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 24 +-gpgpu_n_sub_partition_per_mchannel 2 + +# clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1200:1200:1200:850 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,8,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 21 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 35 
+-ptx_opcode_initiation_tensor 32 + +# sub core model: in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# register banks +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler gto +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +-gpgpu_adaptive_cache_config 1 +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:64,16:0,32 +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 49152 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 33 +-gpgpu_smem_latency 27 +-gpgpu_flush_l1_cache 1 + +# L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 0 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 177 +-dram_latency 103 + +# dram sched config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# dram model config +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4 +-dram_dual_bus_interface 1 + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it untill we create a real energy model +-power_simulation_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + diff --git a/configs/tested-cfgs/TITAN_V/trace.config b/configs/tested-cfgs/TITAN_V/trace.config new file mode 100644 index 000000000..6e193f7bf --- /dev/null +++ b/configs/tested-cfgs/TITAN_V/trace.config @@ 
-0,0 +1,18 @@ +-trace_opcode_latency_initiation_int 4,2 +-trace_opcode_latency_initiation_sp 4,2 +-trace_opcode_latency_initiation_dp 8,4 +-trace_opcode_latency_initiation_sfu 21,8 +-trace_opcode_latency_initiation_tensor 2,2 + +#execute branch insts on spec unit 1 +#,,,,, +-specialized_unit_1 1,4,4,4,4,BRA +-trace_opcode_latency_initiation_spec_op_1 4,4 + +#TEX unit, make fixed latency for all tex insts +-specialized_unit_2 1,4,200,4,4,TEX +-trace_opcode_latency_initiation_spec_op_2 200,4 + +#tensor unit +-specialized_unit_3 1,4,2,4,4,TENSOR +-trace_opcode_latency_initiation_spec_op_3 2,2 From e3d186bbeade78dec776989ccec2a0c0aea27fb4 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 19:43:29 -0400 Subject: [PATCH 066/133] chaning @sets to 4 based on recent ubenchs --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 856f5cffd..a63d50fcb 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -104,7 +104,7 @@ -gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 3750de09f..47bf1c898 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -117,7 +117,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index e7f73059a..3db64b3bc 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -118,7 +118,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 9123e206f..c70cfe8f3 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -111,7 +111,7 @@ -gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 From 24ffab25f41d76b94fd2012a8897312a73a7165f Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 21:17:40 -0400 Subject: [PATCH 067/133] moving shmem option to the base class and change the code to accept turing config --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/gpu-cache.h | 3 +-- src/gpgpu-sim/shader.cc | 46 +++++++---------------------------- src/gpgpu-sim/shader.h | 26 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 17a1cecb1..b33c50bd4 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -374,6 +374,7 @@ class core_config { unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; char *gpgpu_shmem_option; + std::vector shmem_opt_list; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index d80152812..26ed6211c 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -577,6 +577,7 @@ class cache_config { } // set * assoc * cacheline size. Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; if (m_unified_cache_size > 0) { max_cache_multiplier = m_unified_cache_size / original_size; @@ -785,12 +786,10 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - // gpgpu_unified_cache_size is in KB while original_sz is in B return max_cache_multiplier * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - // gpgpu_unified_cache_size is in KB while original_sz is in B return max_cache_multiplier * original_m_assoc; } void print(FILE *fp) const { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 75fbe1646..bc747d676 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3334,56 +3334,28 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - std::vector shmem_list; - for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { - char option[4]; - int j = 0; - while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { - if (gpgpu_shmem_option[i] == ' ') { - // skip spaces - i++; - } else { - if (!isdigit(gpgpu_shmem_option[i])) { - // check for non digits, which should not be here - assert(0 && "invalid config: -gpgpu_shmem_option"); - } - option[j] = gpgpu_shmem_option[i]; - j++; - i++; - } - } - // convert KB -> B - shmem_list.push_back((unsigned)atoi(option) * 1024); - } - unsigned total_shmem = kernel_info->smem * result; // Unified cache config is in KB. 
Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - std::sort(shmem_list.begin(), shmem_list.end()); - assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); switch (adaptive_cache_config) { case FIXED: break; case ADAPTIVE_CACHE: { - // For more info about adaptive cache, see bool l1d_configured = false; unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmem == 0) { - m_L1D_config.set_assoc(max_assoc); - l1d_configured = true; - } else { - for (std::vector::iterator it = shmem_list.begin(); - it < shmem_list.end() - 1; it++) { - if (total_shmem > *it && total_shmem <= *(it + 1)) { - float l1_ratio = 1 - (float) *(it + 1) / total_unified; - m_L1D_config.set_assoc(max_assoc * l1_ratio); - l1d_configured = true; - break; - } + for (std::vector::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float) *(it) / total_unified); + m_L1D_config.set_assoc(max_assoc * l1_ratio); + l1d_configured = true; + break; } } + assert(l1d_configured && "no shared memory option found"); break; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index a7a2c02d6..42bbdcb99 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1495,6 +1495,32 @@ class shader_core_config : public core_config { } else break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } + + //parse gpgpu_shmem_option for adpative cache config + if(adaptive_cache_config) { + for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { + char option[4]; + int j = 0; + while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { + if (gpgpu_shmem_option[i] == ' ') { + // skip spaces + i++; + } else { + if (!isdigit(gpgpu_shmem_option[i])) { + // check for non digits, which should not be here + assert(0 && "invalid config: -gpgpu_shmem_option"); + } + option[j] = gpgpu_shmem_option[i]; + j++; + i++; + } + } + // convert KB -> B + shmem_opt_list.push_back((unsigned)atoi(option) * 1024); + } + std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); + } + } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; From fedcde3789f7921647caee184c0fa104403c848d Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 21:42:29 -0400 Subject: [PATCH 068/133] moving the unified size from the base class config to l1 config --- src/gpgpu-sim/gpu-cache.h | 30 ++++++++++++++++-------------- src/gpgpu-sim/shader.cc | 3 ++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 26ed6211c..8bd62da39 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -575,15 +575,6 @@ class cache_config { } exit_parse_error(); } - - // set * assoc * cacheline size. Then convert Byte to KB - // gpgpu_unified_cache_size is in KB while original_sz is in B - unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; - if (m_unified_cache_size > 0) { - max_cache_multiplier = m_unified_cache_size / original_size; - } else { - max_cache_multiplier = MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - } switch (ct) { case 'N': @@ -694,7 +685,6 @@ class cache_config { m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; - // For more details about difference between FETCH_ON_WRITE and WRITE // VALIDAE policies Read: Jouppi, Norman P. "Cache write policies and // performance". ISCA 93. 
WRITE_ALLOCATE is the old write policy in @@ -786,11 +776,11 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return max_cache_multiplier * m_nset * original_m_assoc; + return get_max_cache_multiplier() * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return max_cache_multiplier * original_m_assoc; + return get_max_cache_multiplier() * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -799,6 +789,8 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; + virtual unsigned get_max_cache_multiplier() const { return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER;} + unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, unsigned m_index_function) const; @@ -840,7 +832,6 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; - unsigned m_unified_cache_size; unsigned m_wr_percent; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; @@ -867,7 +858,6 @@ class cache_config { unsigned m_sector_sz_log2; unsigned original_m_assoc; bool m_is_streaming; - unsigned max_cache_multiplier; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO enum write_policy_t @@ -922,6 +912,18 @@ class l1d_cache_config : public cache_config { unsigned l1_banks_byte_interleaving; unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; + unsigned m_unified_cache_size; + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + } }; class l2_cache_config : public cache_config { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index bc747d676..7f27b7b64 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3335,10 +3335,11 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x unsigned total_shmem = kernel_info->smem * result; + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); + // Unified cache config is in KB. Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); switch (adaptive_cache_config) { case FIXED: break; From 8aee56d7401af9a91a1de3adae1b61329e0d30e5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:10:53 -0400 Subject: [PATCH 069/133] rename set_dirty_byte_mask --- src/gpgpu-sim/gpu-cache.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 8bd62da39..91cde7e8f 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -210,13 +210,13 @@ struct line_cache_block : public cache_block_t { m_status = status; } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); } virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { - m_byte_mask = m_byte_mask | byte_mask; + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; } virtual mem_access_byte_mask_t get_dirty_byte_mask() { - return m_byte_mask; + return m_dirty_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; @@ -270,7 +270,7 @@ struct line_cache_block : public cache_block_t { bool m_set_readable_on_fill; bool m_set_byte_mask_on_fill; bool m_readable; - mem_access_byte_mask_t m_byte_mask; + mem_access_byte_mask_t m_dirty_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -290,7 +290,7 @@ struct sector_cache_block : public cache_block_t { m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; - m_byte_mask.reset(); + m_dirty_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -405,13 +405,13 @@ struct sector_cache_block : public cache_block_t { } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); } virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { - m_byte_mask = m_byte_mask | byte_mask; + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; } virtual mem_access_byte_mask_t get_dirty_byte_mask() { - return m_byte_mask; + return m_dirty_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; @@ -492,7 +492,7 @@ struct sector_cache_block : public cache_block_t { bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; - mem_access_byte_mask_t m_byte_mask; + mem_access_byte_mask_t m_dirty_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); From b466afea67e6d6faf49f01ecfe378257fbdb93af Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 22:20:04 -0400 Subject: [PATCH 070/133] eliminate redundant code in gpu-cache.h --- src/gpgpu-sim/gpu-cache.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 91cde7e8f..6698d9286 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -596,16 +596,6 @@ class cache_config { default: exit_parse_error(); } - switch (rp) { - case 'L': - m_replacement_policy = LRU; - break; - case 'F': - m_replacement_policy = FIFO; - break; - default: - exit_parse_error(); - } switch (wp) { case 'R': m_write_policy = READ_ONLY; From 7fac247e3e1c4326081c3ea4d46da6c5dc83ccb9 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:20:56 -0400 Subject: [PATCH 071/133] change L1 cache config in Volta+ to be write-through and write-allocate based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index a63d50fcb..f715f3aa4 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -104,7 +104,7 @@ -gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 47bf1c898..5f22a42b0 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -117,7 +117,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3db64b3bc..c44563fb6 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -118,7 +118,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index c70cfe8f3..02cdb9ec7 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -101,7 +101,6 @@ ## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# Default config is 28KB DL1 and 100KB shared memory # In Ampere, we assign the remaining shared memory to L1 cache # if the assigned shd mem = 0, then L1 cache = 128KB # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x @@ -111,7 +110,7 @@ -gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 From 0d33266ff6ca9b880dff40f6338c8a5cae696438 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:25:37 -0400 Subject: [PATCH 072/133] oops delete this config, it should not be pushed --- configs/tested-cfgs/TITAN_V/gpgpusim.config | 173 -------------------- configs/tested-cfgs/TITAN_V/trace.config | 18 -- 2 files changed, 191 deletions(-) delete mode 100644 configs/tested-cfgs/TITAN_V/gpgpusim.config delete mode 100644 configs/tested-cfgs/TITAN_V/trace.config diff --git a/configs/tested-cfgs/TITAN_V/gpgpusim.config b/configs/tested-cfgs/TITAN_V/gpgpusim.config deleted file mode 100644 index 8b5cb202f..000000000 --- a/configs/tested-cfgs/TITAN_V/gpgpusim.config +++ /dev/null @@ -1,173 +0,0 @@ -# functional simulator specification --gpgpu_ptx_instruction_classification 0 --gpgpu_ptx_sim_mode 0 --gpgpu_ptx_force_max_capability 70 - -# Device Limits --gpgpu_stack_size_limit 1024 --gpgpu_heap_size_limit 8388608 --gpgpu_runtime_sync_depth_limit 2 --gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 6745 --gpgpu_TB_launch_latency 0 - -# Compute Capability --gpgpu_compute_capability_major 7 --gpgpu_compute_capability_minor 0 - -# PTX execution-driven --gpgpu_ptx_convert_to_ptxplus 0 --gpgpu_ptx_save_converted_ptxplus 0 - -# high level architecture configuration --gpgpu_n_clusters 80 --gpgpu_n_cores_per_cluster 1 --gpgpu_n_mem 24 --gpgpu_n_sub_partition_per_mchannel 2 - -# clock domains -#-gpgpu_clock_domains ::: --gpgpu_clock_domains 1200:1200:1200:850 - -# shader core pipeline config --gpgpu_shader_registers 65536 --gpgpu_registers_per_block 65536 --gpgpu_occupancy_sm_number 70 - --gpgpu_shader_core_pipeline 2048:32 --gpgpu_shader_cta 32 --gpgpu_simd_model 1 - -# Pipeline widths and number of FUs -# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE --gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 --gpgpu_num_sp_units 4 --gpgpu_num_sfu_units 4 --gpgpu_num_dp_units 4 --gpgpu_num_int_units 4 --gpgpu_tensor_core_avail 1 --gpgpu_num_tensor_core_units 4 - -# Instruction latencies and initiation intervals -# "ADD,MAX,MUL,MAD,DIV" -# All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,4,4,4,21 --ptx_opcode_initiation_int 2,2,2,2,2 --ptx_opcode_latency_fp 4,4,4,4,39 --ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,8,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 21 --ptx_opcode_initiation_sfu 8 --ptx_opcode_latency_tesnor 35 --ptx_opcode_initiation_tensor 32 - -# sub core model: in which each scheduler has its own register file and EUs -# i.e. 
schedulers are isolated --gpgpu_sub_core_model 1 -# disable specialized operand collectors and use generic operand collectors instead --gpgpu_enable_specialized_operand_collector 0 --gpgpu_operand_collector_num_units_gen 8 --gpgpu_operand_collector_num_in_ports_gen 8 --gpgpu_operand_collector_num_out_ports_gen 8 -# register banks --gpgpu_num_reg_banks 16 --gpgpu_reg_file_port_throughput 2 - -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 70 - -# warp scheduling --gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto -# a warp scheduler issue mode --gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 - -## L1/shared memory configuration -# ::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo -# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache --gpgpu_adaptive_cache_config 1 --gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:64,16:0,32 --gpgpu_shmem_size 98304 --gpgpu_shmem_sizeDefault 98304 --gpgpu_shmem_per_block 49152 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 33 --gpgpu_smem_latency 27 --gpgpu_flush_l1_cache 1 - -# L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2_texture_only 0 --gpgpu_dram_partition_queues 64:64:64:64 --gpgpu_perf_sim_memcpy 1 --gpgpu_memory_partition_indexing 0 - -# 128 KB Inst. --gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 --gpgpu_inst_fetch_throughput 4 -# 128 KB Tex -# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod --gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 -# 64 KB Const --gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 --gpgpu_perfect_inst_const_cache 1 - -# interconnection -# use built-in local xbar --network_mode 2 --icnt_in_buffer_limit 512 --icnt_out_buffer_limit 512 --icnt_subnets 2 --icnt_flit_size 40 --icnt_arbiter_algo 1 - -# memory partition latency config --gpgpu_l2_rop_latency 177 --dram_latency 103 - -# dram sched config --gpgpu_dram_scheduler 1 --gpgpu_frfcfs_dram_sched_queue_size 64 --gpgpu_dram_return_queue_size 192 - -# dram model config --gpgpu_n_mem_per_ctrlr 1 --gpgpu_dram_buswidth 16 --gpgpu_dram_burst_length 2 --dram_data_command_freq_ratio 2 --gpgpu_mem_address_mask 1 --gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS - -# Mem timing --gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4 --dram_dual_bus_interface 1 - -# select lower bits for bnkgrp to increase bnkgrp parallelism --dram_bnk_indexing_policy 0 --dram_bnkgrp_indexing_policy 1 - -#-dram_seperate_write_queue_enable 1 -#-dram_write_queue_size 64:56:32 - -# stat collection --gpgpu_memlatency_stat 14 --gpgpu_runtime_stat 500 --enable_ptx_file_line_stats 1 --visualizer_enabled 0 - -# power model configs, disable it untill we create a real energy model --power_simulation_enabled 0 - -# tracing functionality -#-trace_enabled 1 -#-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - diff --git a/configs/tested-cfgs/TITAN_V/trace.config b/configs/tested-cfgs/TITAN_V/trace.config deleted file mode 100644 index 6e193f7bf..000000000 --- a/configs/tested-cfgs/TITAN_V/trace.config +++ /dev/null @@ -1,18 +0,0 @@ --trace_opcode_latency_initiation_int 4,2 --trace_opcode_latency_initiation_sp 4,2 
--trace_opcode_latency_initiation_dp 8,4 --trace_opcode_latency_initiation_sfu 21,8 --trace_opcode_latency_initiation_tensor 2,2 - -#execute branch insts on spec unit 1 -#,,,,, --specialized_unit_1 1,4,4,4,4,BRA --trace_opcode_latency_initiation_spec_op_1 4,4 - -#TEX unit, make fixed latency for all tex insts --specialized_unit_2 1,4,200,4,4,TEX --trace_opcode_latency_initiation_spec_op_2 200,4 - -#tensor unit --specialized_unit_3 1,4,2,4,4,TENSOR --trace_opcode_latency_initiation_spec_op_3 2,2 From c8eca04403d3acaff413788e342fd6aadd122948 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:06 -0400 Subject: [PATCH 073/133] fix merge conflict --- src/gpgpu-sim/gpu-cache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 6698d9286..007403f5a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -563,10 +563,10 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u,%u", &ct, + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width, &m_wr_percent); + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); if (ntok < 12) { if (!strcmp(config, "none")) { From f665ad5a49620b47118cbf6d578b469155e2a500 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 20 May 2021 20:52:06 -0400 Subject: [PATCH 074/133] L2 breakdown - reuse mf allocator --- src/abstract_hardware_model.h | 4 ++- src/gpgpu-sim/gpu-cache.cc | 10 +++---- src/gpgpu-sim/l2cache.cc | 56 +++++++++++++---------------------- src/gpgpu-sim/l2cache.h | 4 ++- src/gpgpu-sim/shader.cc | 19 ++++++------ src/gpgpu-sim/shader.h | 4 ++- 6 files changed, 45 insertions(+), 52 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index b33c50bd4..60d7328e7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -876,7 +876,9 @@ class mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const = 0; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 297a94c08..23c5592d0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1338,7 +1338,7 @@ enum cache_request_status data_cache::wr_miss_wa_naive( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1391,7 +1391,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, 
m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1464,7 +1464,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1549,7 +1549,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1631,7 +1631,7 @@ enum cache_request_status data_cache::rd_miss_base( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 00b14d7f6..0db6bd44c 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,18 +57,19 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } -mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, - mem_access_type type, +mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const { + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - -1, -1, m_memory_config, cycle); + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + sid, tpc, m_memory_config, cycle,original_mf); return mf; } memory_partition_unit::memory_partition_unit(unsigned partition_id, @@ -724,16 +725,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } @@ -750,16 +746,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr(), SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr(), + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } @@ -770,16 +761,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, - SECTOR_SIZE, mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 1f5d7c468..59432b88d 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -56,7 +56,9 @@ class partition_mf_allocator : public mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const; private: const memory_config *m_memory_config; diff 
--git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7f27b7b64..51366deb4 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -62,18 +62,19 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( return mf; } -mem_fetch *shader_core_mem_fetch_allocator::alloc( - new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle) const { +mem_fetch *shader_core_mem_fetch_allocator::alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - m_core_id, m_cluster_id, m_memory_config, cycle); + new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + m_core_id, m_cluster_id, m_memory_config, cycle,original_mf); return mf; } ///////////////////////////////////////////////////////////////////////////// diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 42bbdcb99..866231357 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1903,7 +1903,9 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; From b814c52fe9c4538669d845c5f05b247348f6fd1d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 21 May 2021 15:12:43 -0400 Subject: [PATCH 075/133] cast to float - dirty line percentage --- src/gpgpu-sim/gpu-cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 23c5592d0..7e7d2adc4 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -282,7 +282,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, // percentage of dirty lines in the cache // number of dirty lines / total lines in the cache float dirty_line_percentage = - (float) (m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; + ((float) m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { // if number of dirty lines in the cache is greater than From 3b75d8f22694e6a8743793e5bc07779f518650b9 Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 09:04:13 -0400 Subject: [PATCH 076/133] Update version --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 1a1a990cd..c832e567c 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.0.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.1.0 "; From 7e48560639e453fa2e4d86c99bec08f4a43bd884 Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 
09:08:05 -0400 Subject: [PATCH 077/133] Update CHANGES --- CHANGES | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGES b/CHANGES index 0c48a3dc0..7964153c0 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,16 @@ LOG: +Version 4.1.0 versus 4.0.0 +-Features: +1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through +2- Making the L1 adaptive cache policy to be configurable +3- Adding Ampere RTX 3060 config files +-Bugs: +1- Fixing L1 bank hash function bug +2- Fixing L1 read hit counters in gpgpu-sim to match nvprof, to achieve more accurate L1 correlation with the HW +3- Fixing bugs in lazy write handling, thanks to Gwendolyn Voskuilen from Sandia labs for this fix +4- Fixing the backend pipeline for sub_core model +5- Fixing Memory stomp bug at the shader_config +6- Some code refactoring: Version 4.0.0 (development branch) versus 3.2.3 -Front-End: 1- Support .nc cache modifier and __ldg function to access the read-only L1D cache From b6409b4605dac8e39ea22ea6977a28c31177e44a Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 09:34:34 -0400 Subject: [PATCH 078/133] Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f9f6698f..9bb891659 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,11 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, Please see the copyright notice in the file COPYRIGHT distributed with this release in the same directory as this file. +GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support +of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation) +generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information +about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/) + If you use GPGPU-Sim 4.0 in your research, please cite: Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers. @@ -18,7 +23,7 @@ Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA), May 29 - June 3, 2020. -If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional +If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional simulation errors in GPGPU-Sim for your research, please cite: Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla, @@ -26,7 +31,6 @@ Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamod Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933, https://arxiv.org/abs/1811.08933 - If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library for your research please cite: @@ -261,6 +265,7 @@ To clean the docs run The documentation resides at doc/doxygen/html. To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim). + ## Step 3: Run Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded). 
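For example, a minimal sketch of such a build (assuming nvcc is on PATH and a stand-alone vectorAdd.cu; the source and binary names are illustrative only, not part of this patch series):

    # Link against the shared CUDA runtime so the simulator's libcudart.so can be
    # substituted at run time through LD_LIBRARY_PATH (set by sourcing setup_environment).
    nvcc --cudart shared -o vectorAdd vectorAdd.cu
    # Confirm the binary is dynamically linked to libcudart (expect a libcudart.so.* entry).
    ldd vectorAdd | grep libcudart

With the simulator's setup_environment sourced, running ./vectorAdd then resolves libcudart.so to GPGPU-Sim's implementation, so the application executes on the simulator rather than on real hardware.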
From 6c9e13db93e4a1614f7401e9675c62ea40b65a3b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Sun, 23 May 2021 12:59:34 -0400 Subject: [PATCH 079/133] format code --- src/abstract_hardware_model.cc | 12 ++-- src/abstract_hardware_model.h | 13 ++-- src/cuda-sim/instructions.cc | 99 ++++++++++++++------------- src/cuda-sim/ptx_ir.cc | 4 +- src/cuda-sim/ptx_ir.h | 4 +- src/cuda-sim/ptx_parser.cc | 14 ++-- src/gpgpu-sim/gpu-cache.cc | 89 ++++++++++++------------ src/gpgpu-sim/gpu-cache.h | 84 +++++++++++------------ src/gpgpu-sim/gpu-sim.cc | 12 ++-- src/gpgpu-sim/l2cache.cc | 57 ++++++++-------- src/gpgpu-sim/l2cache.h | 13 ++-- src/gpgpu-sim/shader.cc | 119 +++++++++++++++++---------------- src/gpgpu-sim/shader.h | 17 ++--- 13 files changed, 273 insertions(+), 264 deletions(-) diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index e0e1d23cf..30aee60c9 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -448,7 +448,8 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - new_addr_type block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = + line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +531,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread access? @@ -552,7 +554,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +628,8 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? 
diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 60d7328e7..35e28ca57 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -872,13 +872,12 @@ class mem_fetch_allocator { const mem_access_t &access, unsigned long long cycle) const = 0; virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const = 0; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 8936fa80e..0b990e83c 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -166,8 +166,9 @@ void inst_not_implemented(const ptx_instruction *pI); ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, operand_info dstInfo, unsigned type, ptx_thread_info *thread); - -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code); + +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code); void sign_extend(ptx_reg_t &data, unsigned src_size, const operand_info &dst); @@ -1711,40 +1712,50 @@ void bfi_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } thread->set_operand_value(dst, data, i_type, thread, pI); } -void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - const operand_info &dst = pI->dst(); +void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + const operand_info &dst = pI->dst(); const operand_info &src1 = pI->src1(); const unsigned i_type = pI->get_type(); - const ptx_reg_t src1_data = thread->get_operand_value(src1, dst, i_type, thread, 1); - const int msb = ( i_type == U32_TYPE || i_type == S32_TYPE) ? 31 : 63; + const ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, i_type, thread, 1); + const int msb = (i_type == U32_TYPE || i_type == S32_TYPE) ? 
31 : 63; unsigned long a = 0; - switch (i_type) - { - case S32_TYPE: a = src1_data.s32; break; - case U32_TYPE: a = src1_data.u32; break; - case S64_TYPE: a = src1_data.s64; break; - case U64_TYPE: a = src1_data.u64; break; - default: assert(false); abort(); + switch (i_type) { + case S32_TYPE: + a = src1_data.s32; + break; + case U32_TYPE: + a = src1_data.u32; + break; + case S64_TYPE: + a = src1_data.s64; + break; + case U64_TYPE: + a = src1_data.u64; + break; + default: + assert(false); + abort(); } // negate negative signed inputs - if ( ( i_type == S32_TYPE || i_type == S64_TYPE ) && ( a & ( 1 << msb ) ) ) { - a = ~a; + if ((i_type == S32_TYPE || i_type == S64_TYPE) && (a & (1 << msb))) { + a = ~a; } uint32_t d_data = 0xffffffff; for (uint32_t i = msb; i >= 0; i--) { - if (a & (1<set_operand_value(dst, d_data, U32_TYPE, thread, pI); - - } void bra_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6339,12 +6350,10 @@ void vmad_impl(const ptx_instruction *pI, ptx_thread_info *thread) { #define VMAX 0 #define VMIN 1 -void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - video_mem_instruction(pI, thread, VMAX); +void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + video_mem_instruction(pI, thread, VMAX); } -void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ +void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) { video_mem_instruction(pI, thread, VMIN); } void vset_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6440,12 +6449,12 @@ void vote_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } } -void activemask_impl( const ptx_instruction *pI, ptx_thread_info *thread ) -{ +void activemask_impl(const ptx_instruction *pI, ptx_thread_info *thread) { active_mask_t l_activemask_bitset = pI->get_warp_active_mask(); - uint32_t l_activemask_uint = static_cast(l_activemask_bitset.to_ulong()); + uint32_t l_activemask_uint = + static_cast(l_activemask_bitset.to_ulong()); - const operand_info &dst = pI->dst(); + const operand_info &dst = pI->dst(); thread->set_operand_value(dst, l_activemask_uint, U32_TYPE, thread, pI); } @@ -6527,12 +6536,12 @@ ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, return result; } -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code) -{ - const operand_info &dst = pI->dst(); // d - const operand_info &src1 = pI->src1(); // a - const operand_info &src2 = pI->src2(); // b - const operand_info &src3 = pI->src3(); // c +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code) { + const operand_info &dst = pI->dst(); // d + const operand_info &src1 = pI->src1(); // a + const operand_info &src2 = pI->src2(); // b + const operand_info &src3 = pI->src3(); // c const unsigned i_type = pI->get_type(); @@ -6557,19 +6566,18 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i auto option = options.begin(); assert(*option == ATOMIC_MAX || *option == ATOMIC_MIN); - switch ( i_type ) { + switch (i_type) { case S32_TYPE: { // assert all operands are S32_TYPE: scalar_type = pI->get_scalar_type(); - for (std::list::iterator scalar = scalar_type.begin(); scalar != scalar_type.end(); scalar++) - { + for (std::list::iterator scalar = scalar_type.begin(); + scalar != scalar_type.end(); scalar++) { assert(*scalar == S32_TYPE); } assert(scalar_type.size() == 3); scalar_type.clear(); - switch (op_code) - { + switch (op_code) { case VMAX: data.s32 = MY_MAX_I(ta.s32, 
tb.s32); break; @@ -6580,26 +6588,23 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i assert(0); } - switch (*option) - { + switch (*option) { case ATOMIC_MAX: data.s32 = MY_MAX_I(data.s32, c.s32); - break; + break; case ATOMIC_MIN: data.s32 = MY_MIN_I(data.s32, c.s32); - break; + break; default: - assert(0); // not yet implemented + assert(0); // not yet implemented } break; - } default: - assert(0); // not yet implemented + assert(0); // not yet implemented } thread->set_operand_value(dst, data, i_type, thread, pI); return; } - diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index e5b5fb773..d3da4b541 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1147,8 +1147,8 @@ static std::list check_operands( const std::list &operands, gpgpu_context *ctx) { static int g_warn_literal_operands_two_type_inst; if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) || - (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || - (opcode == VMIN_OP) || (opcode == VMAX_OP) ) { + (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || + (opcode == VMIN_OP) || (opcode == VMAX_OP)) { // just make sure these do not have have const operands... if (!g_warn_literal_operands_two_type_inst) { std::list::const_iterator o; diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 42439412c..825175964 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -966,8 +966,8 @@ class ptx_instruction : public warp_inst_t { int get_pred_mod() const { return m_pred_mod; } const char *get_source() const { return m_source.c_str(); } - const std::list get_scalar_type() const {return m_scalar_type;} - const std::list get_options() const {return m_options;} + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } typedef std::vector::const_iterator const_iterator; diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc index afdb41ba8..86a33c2d3 100644 --- a/src/cuda-sim/ptx_parser.cc +++ b/src/cuda-sim/ptx_parser.cc @@ -622,13 +622,13 @@ void ptx_recognizer::add_scalar_type_spec(int type_spec) { g_ptx_token_decode[type_spec].c_str()); g_scalar_type.push_back(type_spec); if (g_scalar_type.size() > 1) { - parse_assert((g_opcode == -1) || (g_opcode == CVT_OP) || - (g_opcode == SET_OP) || (g_opcode == SLCT_OP) || - (g_opcode == TEX_OP) || (g_opcode == MMA_OP) || - (g_opcode == DP4A_OP) || (g_opcode == VMIN_OP) || - (g_opcode == VMAX_OP), - "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " - "type specifier."); + parse_assert( + (g_opcode == -1) || (g_opcode == CVT_OP) || (g_opcode == SET_OP) || + (g_opcode == SLCT_OP) || (g_opcode == TEX_OP) || + (g_opcode == MMA_OP) || (g_opcode == DP4A_OP) || + (g_opcode == VMIN_OP) || (g_opcode == VMAX_OP), + "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " + "type specifier."); } g_scalar_type_spec = type_spec; } diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 7e7d2adc4..28d3215ae 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -37,7 +37,8 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS", "MSHR_HIT"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", + "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char 
*) == NUM_CACHE_REQUEST_STATUS); @@ -63,9 +64,9 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving_log2, - l1_banks_log2, - l1_banks_hashing_function); + return cache_config::hash_function(addr, l1_banks, + l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } unsigned cache_config::set_index(new_addr_type addr) const { @@ -235,7 +236,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask,is_write, probe_mode, mf); + return probe(addr, idx, mask, is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, @@ -281,8 +282,8 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, if (!line->is_reserved_line()) { // percentage of dirty lines in the cache // number of dirty lines / total lines in the cache - float dirty_line_percentage = - ((float) m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; + float dirty_line_percentage = + ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { // if number of dirty lines in the cache is greater than @@ -357,7 +358,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_dirty_byte_mask(), - m_lines[idx]->get_dirty_sector_mask()); + m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), @@ -372,9 +373,9 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); - if (before && !m_lines[idx]->is_modified_line()) { - m_dirty--; - } + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case RESERVATION_FAIL: @@ -391,16 +392,18 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write) { - fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), is_write); +void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, + bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), + is_write); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, - bool is_write) { + mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask,is_write); + enum cache_request_status status = probe(addr, idx, mask, is_write); bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request @@ -423,7 +426,8 @@ void tag_array::fill(new_addr_type addr, unsigned time, void 
tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + m_lines[index]->fill(time, mf->get_access_sector_mask(), + mf->get_access_byte_mask()); m_dirty++; } @@ -437,7 +441,7 @@ void tag_array::flush() { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } } - + m_dirty = 0; is_used = false; } @@ -794,8 +798,8 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { m_stats[type][status]); if (status != RESERVATION_FAIL && status != MSHR_HIT) - // MSHR_HIT is a special type of SECTOR_MISS - // so its already included in the SECTOR_MISS + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS total_access[type] += m_stats[type][status]; } } @@ -1335,10 +1339,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1388,10 +1392,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1461,10 +1465,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1514,7 +1518,6 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } - bool wb = false; evicted_block_info evicted; @@ -1538,7 +1541,7 @@ enum cache_request_status 
data_cache::wr_miss_wa_lazy_fetch_on_read( } else { block->set_m_readable(false, mf->get_access_sector_mask()); if (m_status == HIT_RESERVED) - block->set_readable_on_fill(true, mf->get_access_sector_mask()); + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } if (m_status != RESERVATION_FAIL) { @@ -1546,10 +1549,10 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1596,7 +1599,7 @@ enum cache_request_status data_cache::rd_hit_base( m_tag_array->inc_dirty(); } block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as + mf->get_access_sector_mask()); // mark line as block->set_byte_mask(mf); } return HIT; @@ -1628,10 +1631,10 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 007403f5a..7a2a8d94d 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -84,7 +84,7 @@ struct evicted_block_info { m_block_addr = block_addr; m_modified_size = modified_size; } - void set_info(new_addr_type block_addr, unsigned modified_size, + void set_info(new_addr_type block_addr, unsigned modified_size, mem_access_byte_mask_t byte_mask, mem_access_sector_mask_t sector_mask) { m_block_addr = block_addr; @@ -121,8 +121,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, - mem_access_byte_mask_t byte_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -183,15 +183,14 @@ struct line_cache_block : public cache_block_t { m_set_readable_on_fill = false; m_set_byte_mask_on_fill = false; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, - mem_access_byte_mask_t byte_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // 
if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? MODIFIED : VALID; - - if (m_set_readable_on_fill) - m_readable = true; + + if (m_set_readable_on_fill) m_readable = true; if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_fill_time = time; @@ -358,10 +357,10 @@ struct sector_cache_block : public cache_block_t { // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); m_status[sidx] = m_set_modified_on_fill[sidx] ? MODIFIED : VALID; - + if (m_set_readable_on_fill[sidx]) { - m_readable[sidx] = true; - m_set_readable_on_fill[sidx] = false; + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; } if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); @@ -416,8 +415,7 @@ struct sector_cache_block : public cache_block_t { virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { - if (m_status[i] == MODIFIED) - sector_mask.set(i); + if (m_status[i] == MODIFIED) sector_mask.set(i); } return sector_mask; } @@ -575,7 +573,7 @@ class cache_config { } exit_parse_error(); } - + switch (ct) { case 'N': m_cache_type = NORMAL; @@ -631,18 +629,19 @@ class cache_config { if (m_alloc_policy == STREAMING) { /* For streaming cache: - (1) we set the alloc policy to be on-fill to remove all line_alloc_fail stalls. - if the whole memory is allocated to the L1 cache, then make the allocation to be on_MISS - otherwise, make it ON_FILL to eliminate line allocation fails. - i.e. MSHR throughput is the same, independent on the L1 cache size/associativity - So, we set the allocation policy per kernel basis, see shader.cc, max_cta() function - + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + stalls. if the whole memory is allocated to the L1 cache, then make the + allocation to be on_MISS otherwise, make it ON_FILL to eliminate line + allocation fails. i.e. MSHR throughput is the same, independent on the L1 + cache size/associativity So, we set the allocation policy per kernel + basis, see shader.cc, max_cta() function + (2) We also set the MSHRs to be equal to max allocated cache lines. This is possible by moving TAG to be shared between cache line and MSHR enrty (i.e. for each cache line, there is an MSHR rntey associated with it). This is the easiest think we can think of to model (mimic) L1 streaming cache in Pascal and Volta - + For more information about streaming cache, see: http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf https://ieeexplore.ieee.org/document/8344474/ @@ -697,8 +696,8 @@ class cache_config { } // detect invalid configuration - if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) - and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. 
If the memory subsystem is @@ -746,7 +745,7 @@ class cache_config { break; case 'X': m_set_index_function = BITWISE_XORING_FUNCTION; - break; + break; default: exit_parse_error(); } @@ -779,7 +778,9 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; - virtual unsigned get_max_cache_multiplier() const { return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER;} + virtual unsigned get_max_cache_multiplier() const { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, @@ -826,9 +827,7 @@ class cache_config { write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } - write_policy_t get_write_policy() { - return m_write_policy; - } + write_policy_t get_write_policy() { return m_write_policy; } protected: void exit_parse_error() { @@ -903,17 +902,17 @@ class l1d_cache_config : public cache_config { unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; unsigned m_unified_cache_size; - virtual unsigned get_max_cache_multiplier() const { - // set * assoc * cacheline size. Then convert Byte to KB - // gpgpu_unified_cache_size is in KB while original_sz is in B - if (m_unified_cache_size > 0) { - unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; - assert(m_unified_cache_size % original_size == 0); - return m_unified_cache_size / original_size; - } else { - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - } + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; } + } }; class l2_cache_config : public cache_config { @@ -936,8 +935,7 @@ class tag_array { mem_fetch *mf, bool is_write, bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_access_sector_mask_t mask, - bool is_write, + mem_access_sector_mask_t mask, bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, @@ -948,7 +946,7 @@ class tag_array { void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } @@ -967,9 +965,7 @@ class tag_array { void update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); - void inc_dirty() { - m_dirty++; - } + void inc_dirty() { m_dirty++; } protected: // This constructor is intended for use only from derived classes that wish to diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index df3004772..56ede056c 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,7 +249,8 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); - option_parser_register(opp,"-gpgpu_l1_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); + 
option_parser_register(opp, "-gpgpu_l1_cache_write_ratio", OPT_UINT32, + &m_L1D_config.m_wr_percent, "L1D write ratio", "0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); @@ -327,11 +328,12 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); + option_parser_register(opp, "-gpgpu_shmem_option", OPT_CSTR, + &gpgpu_shmem_option, + "Option list of shared memory sizes", "0"); option_parser_register( - opp, "-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, - "Option list of shared memory sizes", "0"); - option_parser_register( - opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &m_L1D_config.m_unified_cache_size, + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, + &m_L1D_config.m_unified_cache_size, "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 0db6bd44c..57e8ea97c 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,20 +57,18 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } -mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const { - mem_access_t access(type, addr, size, wr, active_mask, byte_mask, - sector_mask, m_memory_config->gpgpu_ctx); +mem_fetch *partition_mf_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, - sid, tpc, m_memory_config, cycle,original_mf); - return mf; + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, + wid, sid, tpc, m_memory_config, cycle, original_mf); + return mf; } memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, @@ -725,11 +723,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } @@ -746,11 +745,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr(), - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr(), mf->get_access_type(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } @@ -761,11 +761,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, + mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 59432b88d..beed76562 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -52,13 +52,12 @@ class partition_mf_allocator : public mem_fetch_allocator { unsigned size, bool wr, unsigned long long cycle) const; virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch 
*original_mf) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 51366deb4..c65affdb6 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -62,21 +62,19 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( return mf; } -mem_fetch *shader_core_mem_fetch_allocator::alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const { - mem_access_t access(type, addr, size, wr, active_mask, byte_mask, - sector_mask, m_memory_config->gpgpu_ctx); - mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, - m_core_id, m_cluster_id, m_memory_config, cycle,original_mf); - return mf; - } +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, m_core_id, + m_cluster_id, m_memory_config, cycle, original_mf); + return mf; +} ///////////////////////////////////////////////////////////////////////////// std::list shader_core_ctx::get_regs_written(const inst_t &fvt) const { @@ -142,8 +140,8 @@ void shader_core_ctx::create_front_pipeline() { m_pipeline_reg[ID_OC_INT].get_size()); for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { if (m_config->m_specialized_unit[j].num_units > 0) - assert(m_config->gpgpu_num_sched_per_core == - m_config->m_specialized_unit[j].id_oc_spec_reg_width); + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); } } @@ -187,15 +185,18 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != std::string::npos - ? CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos + ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos + ? CONCRETE_SCHEDULER_GTO + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != + std::string::npos + ? 
CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -1246,20 +1247,21 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - // This code need to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && pI->op != DP_OP && !(pI->op >= SPEC_UNIT_START_ID)) { bool execute_on_SP = false; bool execute_on_INT = false; - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); - + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, + m_id); + // if INT unit pipline exist, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipline does not exist, then execute all ALU, INT @@ -1320,10 +1322,10 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::DP)) { - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, + m_id); if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, @@ -1340,10 +1342,10 @@ void scheduler_unit::cycle() { (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, + m_id); if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, @@ -1356,11 +1358,10 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, @@ -2007,8 +2008,10 @@ void ldst_unit::L1_latency_queue_cycle() { l1_latency_queue[j][0] = NULL; if (m_config->m_L1D_config.get_write_policy() != WRITE_THROUGH && mf_next->get_inst().is_store() && - (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || - m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + (m_config->m_L1D_config.get_write_allocate_policy() == + FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == + LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { unsigned dec_ack = (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) @@ -2316,7 +2319,7 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit 
::issue(register_set &source_reg) { - warp_inst_t **ready_reg = + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; @@ -3349,15 +3352,15 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { unsigned max_assoc = m_L1D_config.get_max_assoc(); for (std::vector::const_iterator it = shmem_opt_list.begin(); - it < shmem_opt_list.end(); it++) { + it < shmem_opt_list.end(); it++) { if (total_shmem <= *it) { - float l1_ratio = 1 - ((float) *(it) / total_unified); + float l1_ratio = 1 - ((float)*(it) / total_unified); m_L1D_config.set_assoc(max_assoc * l1_ratio); l1d_configured = true; break; } } - + assert(l1d_configured && "no shared memory option found"); break; } @@ -3365,16 +3368,16 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { assert(0); } - if(m_L1D_config.is_streaming()) { - //for streaming cache, if the whole memory is allocated - //to the L1 cache, then make the allocation to be on_MISS - //otherwise, make it ON_FILL to eliminate line allocation fails - //i.e. MSHR throughput is the same, independent on the L1 cache size/associativity - if(total_shmem == 0) { + if (m_L1D_config.is_streaming()) { + // for streaming cache, if the whole memory is allocated + // to the L1 cache, then make the allocation to be on_MISS + // otherwise, make it ON_FILL to eliminate line allocation fails + // i.e. MSHR throughput is the same, independent on the L1 cache + // size/associativity + if (total_shmem == 0) { m_L1D_config.set_allocation_policy(ON_MISS); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); - } - else { + } else { m_L1D_config.set_allocation_policy(ON_FILL); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 866231357..2d2f051b5 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1496,8 +1496,8 @@ class shader_core_config : public core_config { break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } - //parse gpgpu_shmem_option for adpative cache config - if(adaptive_cache_config) { + // parse gpgpu_shmem_option for adpative cache config + if (adaptive_cache_config) { for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { char option[4]; int j = 0; @@ -1520,7 +1520,6 @@ class shader_core_config : public core_config { } std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); } - } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; @@ -1899,13 +1898,11 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, + bool wr, unsigned long long cycle, unsigned wid, + unsigned sid, unsigned tpc, mem_fetch *original_mf) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; From 
778962ed40707369c97a03a3864cc1ee6c7470b6 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 26 May 2021 16:37:39 -0400 Subject: [PATCH 080/133] updating the configs based on the tuner output --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 118 ++++++++---------- .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 108 +++++++--------- 2 files changed, 100 insertions(+), 126 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index f715f3aa4..f35af1b64 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -1,8 +1,3 @@ -# This config models the Turing RTX 2060 -# For more info about turing architecture: -# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf -# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 - # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 @@ -13,7 +8,8 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 5000 +-gpgpu_kernel_launch_latency 7571 +-gpgpu_TB_launch_latency 0 # Compute Capability -gpgpu_compute_capability_major 7 @@ -27,31 +23,27 @@ -gpgpu_n_clusters 30 -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 --gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_n_sub_partition_per_mchannel 2 -# volta clock domains +# clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0 -# boost mode -# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 +-gpgpu_clock_domains 1365:1365:1365:3500.5 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 -gpgpu_occupancy_sm_number 75 -# This implies a maximum of 32 warps/SM --gpgpu_shader_core_pipeline 1024:32 --gpgpu_shader_cta 32 +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 16 -gpgpu_simd_model 1 # Pipeline widths and number of FUs # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE -## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units -## We need to scale the number of pipeline registers to be equal to the number of SP units --gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 -gpgpu_num_int_units 4 -gpgpu_tensor_core_avail 1 -gpgpu_num_tensor_core_units 4 @@ -59,32 +51,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,13,4,5,145,32 --ptx_opcode_initiation_int 2,2,2,2,8,4 --ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,19,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 100 +-ptx_opcode_latency_dp 54,54,54,54,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 -ptx_opcode_latency_tesnor 64 -ptx_opcode_initiation_tensor 64 -# Turing has four schedulers per core --gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler --gpgpu_scheduler gto -## In Turing, a warp scheduler can issue 1 inst per cycle 
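The latency/initiation pairs above follow the usual pairing: -ptx_opcode_latency_* is the number of cycles until a result is ready, while -ptx_opcode_initiation_* is the minimum number of cycles between back-to-back issues to the same unit. A minimal standalone sketch of how the two interact for a burst of independent operations; the struct, names, and numbers here are illustrative only, not simulator code:

#include <cstdio>

// Fully pipelined unit: one new op may issue every 'initiation' cycles, and
// each op completes 'latency' cycles after it issues.
struct fu_timing {
  unsigned latency;
  unsigned initiation;
};

int main() {
  fu_timing fp = {4, 2};  // e.g. an FP ADD: latency 4, initiation interval 2
  unsigned n = 8;         // independent FP adds
  unsigned last_issue = (n - 1) * fp.initiation;  // cycle of the final issue
  unsigned all_done = last_issue + fp.latency;    // last result ready
  std::printf("last issue: cycle %u, all results ready: cycle %u\n",
              last_issue, all_done);
  return 0;
}

Halving an initiation interval roughly doubles throughput for independent work; latency mainly matters for dependent instruction chains.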
--gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 - -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 75 - -# Trung has sub core model, in which each scheduler has its own register file and EUs +# sub core model: in which each scheduler has its own register file and EUs # i.e. schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -92,31 +70,46 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler -# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version --gpgpu_num_reg_banks 16 +# register banks +-gpgpu_num_reg_banks 8 -gpgpu_reg_file_port_throughput 2 +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler gto +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 --gpgpu_shmem_option 32,64 --gpgpu_unified_l1d_size 96 +-gpgpu_shmem_option 0,8,16,32,64,64 +-gpgpu_unified_l1d_size 64 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_l1_cache_write_ratio 25 --gpgpu_l1_latency 20 +-gpgpu_cache:dl1 S:4:128:128,L:T:m:L:L,A:256:32,16:0,32 +-gpgpu_l1_latency 32 -gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 -gpgpu_flush_l1_cache 1 -# shared memory configuration +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 --gpgpu_shmem_per_block 65536 --gpgpu_smem_latency 20 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 30 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 75 -# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache +# L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 @@ -127,34 +120,31 @@ -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex -# Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +# Note, TEX is deprected since Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 # 64 KB Const -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection -#-network_mode 1 -#-inter_config_file config_turing_islip.icnt # use built-in local xbar -network_mode 2 -icnt_in_buffer_limit 512 -icnt_out_buffer_limit 512 -icnt_subnets 2 --icnt_arbiter_algo 1 -icnt_flit_size 40 +-icnt_arbiter_algo 1 # memory partition latency config --gpgpu_l2_rop_latency 160 --dram_latency 100 +-gpgpu_l2_rop_latency 194 +-dram_latency 96 -# dram model config +# dram sched config -gpgpu_dram_scheduler 1 -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# Turing has GDDR6 -# http://monitorinsider.com/GDDR6.html +# dram model config -gpgpu_n_mem_per_ctrlr 1 -gpgpu_dram_buswidth 2 -gpgpu_dram_burst_length 16 @@ -162,9 +152,9 @@ -gpgpu_mem_address_mask 1 -gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS -# Use the same GDDR5 timing, scaled to 3500MHZ --gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: - CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 # select lower bits for bnkgrp to increase bnkgrp parallelism -dram_bnk_indexing_policy 0 @@ -179,7 +169,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta +# power model configs, disable it untill we create a real energy model -power_simulation_enabled 0 # tracing functionality diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 02cdb9ec7..a68703f09 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -1,19 +1,14 @@ -# This config models the Ampere RTX 3070 -# For more info about Ampere architecture: -# https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -# https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf -# https://en.wikipedia.org/wiki/GeForce_30_series # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 --gpgpu_ptx_force_max_capability 86 +-gpgpu_ptx_force_max_capability 86 # Device Limits -gpgpu_stack_size_limit 1024 -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 5000 +-gpgpu_kernel_launch_latency 7872 -gpgpu_TB_launch_latency 0 # Compute Capability @@ -30,26 +25,21 @@ -gpgpu_n_mem 16 -gpgpu_n_sub_partition_per_mchannel 2 -# Ampere clock domains +# clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1320.0:1320.0:1320.0:3500.0 -# boost mode -# -gpgpu_clock_domains 1780.0:1780.0:1780.0:3500.0 +-gpgpu_clock_domains 1132:1132:1132:3500.5 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 -gpgpu_occupancy_sm_number 86 -# This implies a maximum of 64 warps/SM --gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_core_pipeline 1536:32 -gpgpu_shader_cta 32 -gpgpu_simd_model 1 # Pipeline widths and number of FUs # 
ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE -## Ampere GA102 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units -## we need to scale the number of pipeline registers to be equal to the number of SP units -gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 @@ -61,18 +51,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,13,4,5,145,21 --ptx_opcode_initiation_int 2,2,2,2,8,4 --ptx_opcode_latency_fp 4,13,4,5,39 --ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,19,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 100 +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 1,1,1,1,2 +-ptx_opcode_latency_dp 55,55,55,55,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 --ptx_opcode_latency_tesnor 32 --ptx_opcode_initiation_tensor 32 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 -# Ampere has sub core model, in which each scheduler has its own register file and EUs +# sub core model: in which each scheduler has its own register file and EUs # i.e. schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -80,50 +70,47 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# Ampere has 24 double-ported banks, 4 schedulers, 6 banks per scheduler --gpgpu_num_reg_banks 24 +# register banks +-gpgpu_num_reg_banks 8 -gpgpu_reg_file_port_throughput 2 -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 86 - -# Ampere has four schedulers per core +# warp scheduling -gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler -gpgpu_scheduler gto -## In Ampere, a warp scheduler can issue 1 inst per cycle +# a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# In Ampere, we assign the remaining shared memory to L1 cache -# if the assigned shd mem = 0, then L1 cache = 128KB -# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x -# disable this mode in case of multi kernels/apps execution +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 -gpgpu_shmem_option 0,8,16,32,64,100 -gpgpu_unified_l1d_size 128 -# Ampere unified cache has four banks +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_l1_cache_write_ratio 25 +-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 +-gpgpu_l1_latency 39 -gpgpu_gmem_skip_L1D 0 --gpgpu_l1_latency 20 --gpgpu_n_cluster_ejection_buffer_size 32 -gpgpu_flush_l1_cache 1 -# shared memory configuration +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 
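With the unified L1/shared carve-out enabled, the simulator picks the smallest -gpgpu_shmem_option entry that fits the shared memory in use and hands the rest of -gpgpu_unified_l1d_size to L1 by scaling the cache associativity. A minimal sketch of that selection, mirroring shader_core_config::max_cta() (with the round-to-nearest fix applied later in this series); the function name and parameters here are local to the sketch:

#include <algorithm>
#include <cassert>
#include <vector>

// Returns the L1 associativity to use, given the shared memory actually
// needed per SM (bytes), the unified capacity (KB), the configurable shared
// memory carve-outs (KB), and the associativity of the full-size L1.
unsigned pick_l1_assoc(unsigned shmem_bytes, unsigned unified_kb,
                       std::vector<unsigned> options_kb, unsigned max_assoc) {
  std::sort(options_kb.begin(), options_kb.end());
  for (unsigned opt_kb : options_kb) {
    if (shmem_bytes <= opt_kb * 1024) {
      float l1_ratio = 1.0f - (float)opt_kb / (float)unified_kb;
      assert(l1_ratio >= 0.0f && l1_ratio <= 1.0f);
      return (unsigned)(max_assoc * l1_ratio + 0.5f);  // round to nearest
    }
  }
  assert(0 && "no shared memory option found");
  return 0;
}

For the RTX 3070 numbers above (128 KB unified, options 0,8,16,32,64,100), a kernel whose resident CTAs need 20 KB of shared memory per SM selects the 32 KB carve-out, so L1 keeps 96 KB: pick_l1_assoc(20 * 1024, 128, {0, 8, 16, 32, 64, 100}, 256) returns 192, i.e. 192 ways * 4 sets * 128 B lines.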
--gpgpu_shmem_per_block 102400 --gpgpu_smem_latency 20 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 29 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 -# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 3MB L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +# L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 1 @@ -133,15 +120,13 @@ -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex -# Note, TEX is deprecated, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 # 64 KB Const -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection -#-network_mode 1 -#-inter_config_file config_ampere_islip.icnt # use built-in local xbar -network_mode 2 -icnt_in_buffer_limit 512 @@ -151,16 +136,15 @@ -icnt_arbiter_algo 1 # memory partition latency config --gpgpu_l2_rop_latency 160 --dram_latency 100 +-gpgpu_l2_rop_latency 187 +-dram_latency 254 -# dram model config +# dram sched config -gpgpu_dram_scheduler 1 -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# Ampere RTX3060 has GDDR6 -# http://monitorinsider.com/GDDR6.html +# dram model config -gpgpu_n_mem_per_ctrlr 1 -gpgpu_dram_buswidth 2 -gpgpu_dram_burst_length 16 @@ -168,9 +152,9 @@ -gpgpu_mem_address_mask 1 -gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS -# Use the same GDDR5 timing, scaled to 3500MHZ --gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: - CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 # select lower bits for bnkgrp to increase bnkgrp parallelism -dram_bnk_indexing_policy 0 @@ -185,7 +169,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Ampere +# power model configs, disable it untill we create a real energy model -power_simulation_enabled 0 # tracing functionality From 3eea0140bc19dc1822d40e29d1aa55643894c6d3 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 26 May 2021 19:44:28 -0400 Subject: [PATCH 081/133] changing kernel latency --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index f35af1b64..a9943703a 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -8,7 +8,7 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 7571 +-gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 # Compute Capability diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index a68703f09..fda3851d0 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -8,7 +8,7 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 7872 +-gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 # Compute Capability From 6ad461a95ac71e0597274c4f750ce03bb3a6871e Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Thu, 27 May 2021 15:38:26 -0400 Subject: [PATCH 082/133] fixing configs --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 8 ++++---- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index a9943703a..cc3152c59 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -55,7 +55,7 @@ -ptx_opcode_initiation_int 2,2,2,2,2 -ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 54,54,54,54,330 +-ptx_opcode_latency_dp 64,64,64,64,330 -ptx_opcode_initiation_dp 64,64,64,64,130 -ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 @@ -87,11 +87,11 @@ # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 --gpgpu_shmem_option 0,8,16,32,64,64 --gpgpu_unified_l1d_size 64 +-gpgpu_shmem_option 32,64 +-gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:128,L:T:m:L:L,A:256:32,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32 -gpgpu_l1_latency 32 -gpgpu_gmem_skip_L1D 0 -gpgpu_flush_l1_cache 1 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index fda3851d0..098cb1d20 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -55,7 +55,7 @@ -ptx_opcode_initiation_int 2,2,2,2,2 -ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 1,1,1,1,2 --ptx_opcode_latency_dp 55,55,55,55,330 +-ptx_opcode_latency_dp 64,64,64,64,330 -ptx_opcode_initiation_dp 64,64,64,64,130 -ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 From 110aeb12257b030b32cdc47e4cca0ed1089ac855 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 31 May 2021 15:55:18 -0400 Subject: [PATCH 083/133] rewrite shmem_option parsing --- 
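The rewrite below swaps the hand-rolled character scan for std::stringstream and std::getline. A self-contained sketch of that parsing approach; the helper name is ours, and the empty-token guard is an extra safety net rather than something the patch itself adds:

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

// Parse a "-gpgpu_shmem_option"-style list such as "0,8,16,32,64,100"
// (values in KB) into a sorted vector of byte sizes.
std::vector<unsigned> parse_shmem_option(const std::string &opt) {
  std::vector<unsigned> sizes_bytes;
  std::stringstream ss(opt);
  std::string token;
  while (std::getline(ss, token, ',')) {
    if (token.empty()) continue;  // skip stray commas
    sizes_bytes.push_back((unsigned)std::stoi(token) * 1024);  // KB -> B
  }
  std::sort(sizes_bytes.begin(), sizes_bytes.end());
  return sizes_bytes;
}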
src/gpgpu-sim/shader.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 2d2f051b5..4c6de0683 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1498,25 +1498,11 @@ class shader_core_config : public core_config { // parse gpgpu_shmem_option for adpative cache config if (adaptive_cache_config) { - for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { - char option[4]; - int j = 0; - while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { - if (gpgpu_shmem_option[i] == ' ') { - // skip spaces - i++; - } else { - if (!isdigit(gpgpu_shmem_option[i])) { - // check for non digits, which should not be here - assert(0 && "invalid config: -gpgpu_shmem_option"); - } - option[j] = gpgpu_shmem_option[i]; - j++; - i++; - } - } - // convert KB -> B - shmem_opt_list.push_back((unsigned)atoi(option) * 1024); + std::stringstream ss(gpgpu_shmem_option); + while (ss.good()) { + std::string option; + std::getline(ss, option, ','); + shmem_opt_list.push_back((unsigned)std::stoi(option) * 1024); } std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); } From 04462cbf5b56e0416c3a733b4214351ac227f4c0 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:55:44 -0400 Subject: [PATCH 084/133] update readable --- src/gpgpu-sim/gpu-cache.cc | 30 +++++++++++++++++++++++++++--- src/gpgpu-sim/gpu-cache.h | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 28d3215ae..a35f5022d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -284,10 +284,12 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, // number of dirty lines / total lines in the cache float dirty_line_percentage = ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; + // If the cacheline is from a load op (not modified), + // or the total dirty cacheline is above a specific value, + // Then this cacheline is eligible to be considered for replacement candidate + // i.e. Only evict clean cachelines until total dirty cachelines reach the limit. 
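// Read as a standalone predicate, the check that follows amounts to the small
// helper sketched here; the names are local to this illustration, not
// simulator members:
inline bool eligible_victim(bool line_is_dirty, unsigned dirty_lines,
                            unsigned total_lines, unsigned wr_percent) {
  // Clean lines are always replacement candidates; dirty lines become
  // candidates only once the cache-wide dirty fraction reaches the
  // configured write ratio (-gpgpu_l1_cache_write_ratio, in percent).
  float dirty_pct = 100.0f * dirty_lines / (float)total_lines;
  return !line_is_dirty || dirty_pct >= wr_percent;
}
// With a write ratio of 25, as in the Turing and Ampere configs in this
// series, dirty lines are considered for eviction only after a quarter of
// the cache has become dirty.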
if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { - // if number of dirty lines in the cache is greater than - // a specific value all_reserved = false; if (line->is_invalid_line()) { invalid_line = index; @@ -354,7 +356,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; - m_lines[idx]->set_byte_mask(mf); + // m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_dirty_byte_mask(), @@ -1191,6 +1193,25 @@ void data_cache::send_write_request(mem_fetch *mf, cache_event request, mf->set_status(m_miss_queue_status, time); } +void data_cache::update_m_readable(mem_fetch *mf, unsigned cache_index) { + cache_block_t *block = m_tag_array->get_block(cache_index); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (mf->get_access_sector_mask().test(i)) { + bool all_set = true; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + // If any bit in the byte mask (within the sector) is not set, + // the sector is unreadble + if (!block->get_dirty_byte_mask().test(k)) { + all_set = false; + break; + } + } + if (all_set) + block->set_m_readable(true, mf->get_access_sector_mask()); + } + } +} + /****** Write-hit functions (Set by config file) ******/ /// Write-back hit: Mark block as modified @@ -1207,6 +1228,7 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); + update_m_readable(mf,cache_index); return HIT; } @@ -1230,6 +1252,7 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); + update_m_readable(mf,cache_index); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1543,6 +1566,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( if (m_status == HIT_RESERVED) block->set_readable_on_fill(true, mf->get_access_sector_mask()); } + update_m_readable(mf,cache_index); if (m_status != RESERVATION_FAIL) { // If evicted block is modified and not a write-through diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 7a2a8d94d..67d084cbf 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1570,7 +1570,7 @@ class data_cache : public baseline_cache { /// Sends write request to lower level memory (write or writeback) void send_write_request(mem_fetch *mf, cache_event request, unsigned time, std::list &events); - + void update_m_readable(mem_fetch *mf, unsigned cache_index); // Member Function pointers - Set by configuration options // to the functions below each grouping /******* Write-hit configs *******/ From e9d781a467dd21c3ec3f1508aede803cb3ffb2c3 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:56:04 -0400 Subject: [PATCH 085/133] minor improvements --- src/gpgpu-sim/l2cache.cc | 9 +++++---- src/gpgpu-sim/shader.cc | 34 ++++++++++++++-------------------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 57e8ea97c..f1c761fe5 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -716,7 +716,7 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { if (mf->get_data_size() == 
SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128) { + } else if (mf->get_data_size() == MAX_MEMORY_ACCESS_SIZE) { // break down every sector mem_access_byte_mask_t mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { @@ -732,11 +732,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { result.push_back(n_mf); } + // This is for constant cache } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { + (mf->get_access_sector_mask().all() || + mf->get_access_sector_mask().none())) { unsigned start; - if (mf->get_addr() % 128 == 0) + if (mf->get_addr() % MAX_MEMORY_ACCESS_SIZE == 0) start = 0; else start = 2; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c65affdb6..0f6631229 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3344,30 +3344,24 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // Unified cache config is in KB. Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_CACHE: { - bool l1d_configured = false; - unsigned max_assoc = m_L1D_config.get_max_assoc(); - - for (std::vector::const_iterator it = shmem_opt_list.begin(); - it < shmem_opt_list.end(); it++) { - if (total_shmem <= *it) { - float l1_ratio = 1 - ((float)*(it) / total_unified); - m_L1D_config.set_assoc(max_assoc * l1_ratio); - l1d_configured = true; - break; - } - } - - assert(l1d_configured && "no shared memory option found"); + bool l1d_configured = false; + unsigned max_assoc = m_L1D_config.get_max_assoc(); + + for (std::vector::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float)*(it) / total_unified); + // make sure the ratio is between 0 and 1 + assert(0 <= l1_ratio && l1_ratio <= 1); + // round to nearest instead of round down + m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + l1d_configured = true; break; } - default: - assert(0); } + assert(l1d_configured && "no shared memory option found"); + if (m_L1D_config.is_streaming()) { // for streaming cache, if the whole memory is allocated // to the L1 cache, then make the allocation to be on_MISS From 0f088dc11a47cb3de905de3483f6a1c019b7d283 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 16 Jun 2021 10:22:57 -0400 Subject: [PATCH 086/133] correct dirty counter --- src/gpgpu-sim/gpu-cache.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index a35f5022d..c93ac5fbc 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -428,9 +428,11 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask(), - mf->get_access_byte_mask()); - m_dirty++; + bool before = m_lines[index]->is_modified_line(); + m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need write back the flushed data to the upper level From 3cf24b8afea9a519fa052e68cf10c1f774ab5f68 Mon Sep 17 00:00:00 2001 From: JRPAN 
<25518778+JRPan@users.noreply.github.com> Date: Tue, 22 Jun 2021 20:27:43 -0400 Subject: [PATCH 087/133] WT in lazy fetch on read --- src/gpgpu-sim/gpu-cache.cc | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index c93ac5fbc..7416246f0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1511,35 +1511,17 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events, enum cache_request_status status) { new_addr_type block_addr = m_config.block_addr(addr); - new_addr_type mshr_addr = m_config.mshr_addr(mf->get_addr()); // if the request writes to the whole cache line/sector, then, write and set // cache line Modified. and no need to send read request to memory or reserve // mshr - // Write allocate, maximum 2 requests (write miss, write back request) - // Conservatively ensure the worst-case request can be handled this - // cycle - if (m_config.m_write_policy == WRITE_THROUGH) { - bool mshr_hit = m_mshrs.probe(mshr_addr); - bool mshr_avail = !m_mshrs.full(mshr_addr); - if (miss_queue_full(1) || - (!(mshr_hit && mshr_avail) && - !(!mshr_hit && mshr_avail && - (m_miss_queue.size() < m_config.m_miss_queue_size)))) { - // check what is the exactly the failure reason - if (miss_queue_full(1)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); - else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); - else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); - else - assert(0); - - return RESERVATION_FAIL; - } + if (miss_queue_full(0)) { + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + return RESERVATION_FAIL; // cannot handle request this cycle + } + if (m_config.m_write_policy == WRITE_THROUGH) { send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } From b1befa8422493e0deb45811e6b87399355a532ed Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 16 Aug 2021 18:11:30 -0400 Subject: [PATCH 088/133] Adding restricted round robin scheduler --- src/gpgpu-sim/shader.cc | 43 ++++++++++++++++++++++++++++++++++++++++- src/gpgpu-sim/shader.h | 28 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 0f6631229..7cee40fc9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -191,6 +191,8 @@ void shader_core_ctx::create_schedulers() { ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("rrr") != std::string::npos + ? CONCRETE_SCHEDULER_RRR : sched_config.find("old") != std::string::npos ? 
CONCRETE_SCHEDULER_OLDEST_FIRST : sched_config.find("warp_limiting") != @@ -225,6 +227,14 @@ void shader_core_ctx::create_schedulers() { &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, &m_pipeline_reg[ID_OC_MEM], i)); break; + case CONCRETE_SCHEDULER_RRR: + schedulers.push_back(new rrr_scheduler( + m_stats, this, m_scoreboard, m_simt_stack, &m_warp, + &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], + &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], + &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, + &m_pipeline_reg[ID_OC_MEM], i)); + break; case CONCRETE_SCHEDULER_OLDEST_FIRST: schedulers.push_back(new oldest_scheduler( m_stats, this, m_scoreboard, m_simt_stack, &m_warp, @@ -1101,6 +1111,33 @@ void scheduler_unit::order_lrr( } } +template +void scheduler_unit::order_rrr( + std::vector &result_list, const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add) { + result_list.clear(); + + if (m_num_issued_last_cycle > 0 || warp(m_current_turn_warp).done_exit() || + warp(m_current_turn_warp).waiting()) { + std::vector::const_iterator iter = + (last_issued_from_input == input_list.end()) ? + input_list.begin() : last_issued_from_input + 1; + for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + if (iter == input_list.end()) { + iter = input_list.begin(); + } + unsigned warp_id = (*iter)->get_warp_id(); + if (!(*iter)->done_exit() && !(*iter)->waiting()) { + result_list.push_back(*iter); + m_current_turn_warp = warp_id; + break; + } + } + } else { + result_list.push_back(&warp(m_current_turn_warp)); + } +} /** * A general function to order things in an priority-based way. * The core usage of the function is similar to order_lrr. 
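In short, the rrr ("restricted round-robin") policy added above prioritises a single warp per cycle: the turn rotates to the next runnable warp after the scheduler issues something (or when the current warp has exited or is waiting, e.g. at a barrier), and otherwise the same warp keeps its turn. A simplified standalone model of that turn-advancing rule; the data structures and names are ours, not the simulator's:

#include <vector>

struct warp_state {
  bool done;     // warp has exited
  bool waiting;  // warp cannot issue this cycle (e.g. at a barrier)
};

// Decide whose turn it is this cycle, given last cycle's outcome.
int next_rrr_turn(const std::vector<warp_state> &warps, int current_turn,
                  unsigned issued_last_cycle) {
  const warp_state &cur = warps[current_turn];
  if (issued_last_cycle == 0 && !cur.done && !cur.waiting)
    return current_turn;  // stalled but still runnable: keep the turn
  const int n = (int)warps.size();
  for (int step = 1; step <= n; ++step) {
    int cand = (current_turn + step) % n;
    if (!warps[cand].done && !warps[cand].waiting) return cand;
  }
  return current_turn;  // nothing runnable this cycle
}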
@@ -1433,7 +1470,7 @@ void scheduler_unit::cycle() { m_last_supervised_issued = supervised_iter; } } - + m_num_issued_last_cycle = issued; if (issued == 1) m_stats->single_issue_nums[m_id]++; else if (issued > 1) @@ -1482,6 +1519,10 @@ void lrr_scheduler::order_warps() { order_lrr(m_next_cycle_prioritized_warps, m_supervised_warps, m_last_supervised_issued, m_supervised_warps.size()); } +void rrr_scheduler::order_warps() { + order_rrr(m_next_cycle_prioritized_warps, m_supervised_warps, + m_last_supervised_issued, m_supervised_warps.size()); +} void gto_scheduler::order_warps() { order_by_priority(m_next_cycle_prioritized_warps, m_supervised_warps, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 4c6de0683..9cb256a29 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -321,6 +321,7 @@ enum concrete_scheduler { CONCRETE_SCHEDULER_LRR = 0, CONCRETE_SCHEDULER_GTO, CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE, + CONCRETE_SCHEDULER_RRR, CONCRETE_SCHEDULER_WARP_LIMITING, CONCRETE_SCHEDULER_OLDEST_FIRST, NUM_CONCRETE_SCHEDULERS @@ -372,6 +373,12 @@ class scheduler_unit { // this can be copied freely, so can be used in std const typename std::vector &input_list, const typename std::vector::const_iterator &last_issued_from_input, unsigned num_warps_to_add); + template + void order_rrr( + typename std::vector &result_list, + const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add); enum OrderingType { // The item that issued last is prioritized first then the sorted result @@ -430,6 +437,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std register_set *m_tensor_core_out; register_set *m_mem_out; std::vector &m_spec_cores_out; + unsigned m_num_issued_last_cycle; + unsigned m_current_turn_warp; int m_id; }; @@ -453,6 +462,25 @@ class lrr_scheduler : public scheduler_unit { } }; +class rrr_scheduler : public scheduler_unit { + public: + rrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + Scoreboard *scoreboard, simt_stack **simt, + std::vector *warp, register_set *sp_out, + register_set *dp_out, register_set *sfu_out, + register_set *int_out, register_set *tensor_core_out, + std::vector &spec_cores_out, + register_set *mem_out, int id) + : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + sfu_out, int_out, tensor_core_out, spec_cores_out, + mem_out, id) {} + virtual ~rrr_scheduler() {} + virtual void order_warps(); + virtual void done_adding_supervised_warps() { + m_last_supervised_issued = m_supervised_warps.end(); + } +}; + class gto_scheduler : public scheduler_unit { public: gto_scheduler(shader_core_stats *stats, shader_core_ctx *shader, From b6581477462ea15d92967588277c4fe822a67bf7 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 16 Aug 2021 18:15:20 -0400 Subject: [PATCH 089/133] better oc selecting when sub core enabled --- src/gpgpu-sim/shader.cc | 3 +++ src/gpgpu-sim/shader.h | 45 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7cee40fc9..bcfda1867 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3997,6 +3997,9 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, sub_core_model, reg_id, m_num_banks_per_sched); } + for (unsigned j = 0; j < 
m_dispatch_units.size(); j++) { + m_dispatch_units[j].init(sub_core_model,m_num_warp_scheds); + } m_initialized = true; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 9cb256a29..f2fac1209 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -950,13 +950,44 @@ class opndcoll_rfu_t { // operand collector based register file unit m_num_collectors = (*cus).size(); m_next_cu = 0; } + void init(bool sub_core_model, unsigned num_warp_scheds) { + m_sub_core_model = sub_core_model; + m_num_warp_scheds = num_warp_scheds; + if (m_sub_core_model) { + m_last_cu_set = new unsigned(m_num_warp_scheds); + for (unsigned i = 0; i < m_num_warp_scheds; i++) + { + m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; + } + } + + } collector_unit_t *find_ready() { - for (unsigned n = 0; n < m_num_collectors; n++) { - unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { - m_last_cu = c; - return &((*m_collector_units)[c]); + if (m_sub_core_model) { + assert(m_num_collectors % m_num_warp_scheds == 0 && + m_num_collectors >= m_num_warp_scheds); + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + for (unsigned i = 0; i < m_num_warp_scheds; i++) { + unsigned cuLowerBound = i * cusPerSched; + unsigned cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= m_num_collectors); + assert(cuLowerBound <= m_last_cu_set[i] && m_last_cu_set[i] <= cuUpperBound); + for (unsigned j = cuLowerBound; j < cuUpperBound; j++) { + unsigned c = cuLowerBound + (m_last_cu_set[i] + j + 1) % cusPerSched; + if ((*m_collector_units)[c].ready()) { + m_last_cu_set[i] = c; + return &((*m_collector_units)[c]); + } + } + } + } else { + for (unsigned n = 0; n < m_num_collectors; n++) { + unsigned c = (m_last_cu + n + 1) % m_num_collectors; + if ((*m_collector_units)[c].ready()) { + m_last_cu = c; + return &((*m_collector_units)[c]); + } } } return NULL; @@ -966,7 +997,11 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collectors; std::vector *m_collector_units; unsigned m_last_cu; // dispatch ready cu's rr + unsigned *m_last_cu_set; unsigned m_next_cu; // for initialization + + bool m_sub_core_model; + unsigned m_num_warp_scheds; }; // opndcoll_rfu_t data members From a8256e50a6d25338f659da76ff9c3595132f54b2 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 23 Aug 2021 13:06:13 -0400 Subject: [PATCH 090/133] Update volta to use lrr scheduler --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 5f22a42b0..425bc1690 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -99,7 +99,7 @@ # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index c44563fb6..0c69c7084 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -100,7 +100,7 @@ # Volta has four schedulers per core -gpgpu_num_sched_per_core 
4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 From 84c4f46fb78b529ab2447d7a676f5b3ac2d9c05f Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 23 Aug 2021 13:06:54 -0400 Subject: [PATCH 091/133] Ampere and Turing also lrr scheduler --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index cc3152c59..0ae91a50f 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -76,7 +76,7 @@ # warp scheduling -gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto +-gpgpu_scheduler lrr # a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 098cb1d20..854378151 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -76,7 +76,7 @@ # warp scheduling -gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto +-gpgpu_scheduler lrr # a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 From 84c6cf45131e42b1a724ebf7977987a9ddb70db9 Mon Sep 17 00:00:00 2001 From: VijayKandiah Date: Sun, 17 Oct 2021 02:18:10 -0500 Subject: [PATCH 092/133] AccelWattch dev Integration --- CHANGES | 4 + COPYRIGHT | 30 + Makefile | 16 +- README.md | 49 +- .../SM6_TITANX/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM6_TITANX/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM6_TITANX/accelwattch_sass_sim.xml | 613 +++++++++ .../SM6_TITANX/accelwattch_sass_sim_alt.xml | 613 +++++++++ .../tested-cfgs/SM6_TITANX/gpgpusim.config | 33 +- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 3 +- .../SM75_RTX2060_S/accelwattch_ptx_sim.xml | 623 +++++++++ .../accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM75_RTX2060_S/accelwattch_sass_sim.xml | 613 +++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++ .../SM75_RTX2060_S/config_turing_islip.icnt | 73 ++ .../SM75_RTX2060_S/gpgpusim.config | 210 +++ .../SM7_QV100/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM7_QV100/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM7_QV100/accelwattch_sass_hw.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_hybrid.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_sim.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_sim_alt.xml | 613 +++++++++ configs/tested-cfgs/SM7_QV100/gpgpusim.config | 38 +- configs/tested-cfgs/SM7_QV100/hw_perf.csv | 26 + .../SM7_TITANV/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM7_TITANV/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM7_TITANV/accelwattch_sass_hw.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_hybrid.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_sim.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_sim_alt.xml | 613 +++++++++ .../tested-cfgs/SM7_TITANV/gpgpusim.config | 3 - format-code.sh | 4 +- setup_environment | 14 +- src/abstract_hardware_model.cc | 28 +- src/abstract_hardware_model.h | 69 +- src/{gpuwattch => accelwattch}/Alpha21364.xml | 0 src/{gpuwattch => accelwattch}/Niagara1.xml | 0 .../Niagara1_sharing.xml | 0 .../Niagara1_sharing_DC.xml | 0 
.../Niagara1_sharing_SBT.xml | 0 .../Niagara1_sharing_ST.xml | 0 src/{gpuwattch => accelwattch}/Niagara2.xml | 0 src/{gpuwattch => accelwattch}/Penryn.xml | 0 src/{gpuwattch => accelwattch}/README | 0 src/{gpuwattch => accelwattch}/XML_Parse.cc | 361 +++++- src/{gpuwattch => accelwattch}/XML_Parse.h | 60 +- src/{gpuwattch => accelwattch}/Xeon.xml | 0 src/{gpuwattch => accelwattch}/arch_const.h | 0 src/{gpuwattch => accelwattch}/array.cc | 0 src/{gpuwattch => accelwattch}/array.h | 0 .../basic_components.cc | 0 .../basic_components.h | 0 src/{gpuwattch => accelwattch}/cacti/README | 0 .../cacti/Ucache.cc | 4 +- src/{gpuwattch => accelwattch}/cacti/Ucache.h | 0 .../cacti/arbiter.cc | 0 .../cacti/arbiter.h | 0 src/{gpuwattch => accelwattch}/cacti/area.cc | 0 src/{gpuwattch => accelwattch}/cacti/area.h | 0 src/{gpuwattch => accelwattch}/cacti/bank.cc | 0 src/{gpuwattch => accelwattch}/cacti/bank.h | 0 .../cacti/basic_circuit.cc | 0 .../cacti/basic_circuit.h | 0 .../cacti/batch_tests | 0 .../cacti/cache.cfg | 0 src/{gpuwattch => accelwattch}/cacti/cacti.i | 0 src/{gpuwattch => accelwattch}/cacti/cacti.mk | 2 +- .../cacti/cacti_interface.cc | 0 .../cacti/cacti_interface.h | 0 .../cacti/component.cc | 0 .../cacti/component.h | 0 src/{gpuwattch => accelwattch}/cacti/const.h | 0 .../cacti/contention.dat | 0 .../cacti/crossbar.cc | 0 .../cacti/crossbar.h | 0 .../cacti/decoder.cc | 0 .../cacti/decoder.h | 0 .../cacti/highradix.cc | 0 .../cacti/highradix.h | 0 .../cacti/htree2.cc | 0 src/{gpuwattch => accelwattch}/cacti/htree2.h | 0 src/{gpuwattch => accelwattch}/cacti/io.cc | 0 src/{gpuwattch => accelwattch}/cacti/io.h | 0 src/{gpuwattch => accelwattch}/cacti/main.cc | 0 src/{gpuwattch => accelwattch}/cacti/makefile | 0 src/{gpuwattch => accelwattch}/cacti/mat.cc | 0 src/{gpuwattch => accelwattch}/cacti/mat.h | 0 src/{gpuwattch => accelwattch}/cacti/nuca.cc | 0 src/{gpuwattch => accelwattch}/cacti/nuca.h | 0 .../cacti/out_batch_test_result.csv | 0 .../cacti/parameter.cc | 0 .../cacti/parameter.h | 0 .../cacti/router.cc | 0 src/{gpuwattch => accelwattch}/cacti/router.h | 0 .../cacti/subarray.cc | 0 .../cacti/subarray.h | 0 .../cacti/technology.cc | 0 src/{gpuwattch => accelwattch}/cacti/uca.cc | 0 src/{gpuwattch => accelwattch}/cacti/uca.h | 0 src/{gpuwattch => accelwattch}/cacti/wire.cc | 0 src/{gpuwattch => accelwattch}/cacti/wire.h | 0 src/{gpuwattch => accelwattch}/core.cc | 0 src/{gpuwattch => accelwattch}/core.h | 0 src/{gpuwattch => accelwattch}/fermi.xml | 0 src/{gpuwattch => accelwattch}/globalvar.h | 0 src/{gpuwattch => accelwattch}/gpgpu.xml | 0 .../gpgpu_sim.verify | 0 src/accelwattch/gpgpu_sim_wrapper.cc | 1143 +++++++++++++++++ .../gpgpu_sim_wrapper.h | 81 +- .../gpgpu_static.xml | 0 .../interconnect.cc | 0 src/{gpuwattch => accelwattch}/interconnect.h | 0 .../iocontrollers.cc | 0 .../iocontrollers.h | 0 src/{gpuwattch => accelwattch}/logic.cc | 0 src/{gpuwattch => accelwattch}/logic.h | 0 src/{gpuwattch => accelwattch}/main.cc | 0 src/{gpuwattch => accelwattch}/makefile | 0 src/{gpuwattch => accelwattch}/mcpat.mk | 2 +- .../mcpatXeonCore.mk | 0 src/{gpuwattch => accelwattch}/memoryctrl.cc | 0 src/{gpuwattch => accelwattch}/memoryctrl.h | 0 src/{gpuwattch => accelwattch}/noc.cc | 0 src/{gpuwattch => accelwattch}/noc.h | 0 src/{gpuwattch => accelwattch}/processor.cc | 12 +- src/{gpuwattch => accelwattch}/processor.h | 0 src/{gpuwattch => accelwattch}/quadro.xml | 0 .../results/Alpha21364 | 0 .../results/Alpha21364_90nm | 0 src/{gpuwattch => accelwattch}/results/Penryn | 0 src/{gpuwattch 
=> accelwattch}/results/T1 | 0 .../results/T1_DC_64 | 0 .../results/T1_SBT_64 | 0 .../results/T1_ST_64 | 0 src/{gpuwattch => accelwattch}/results/T2 | 0 .../results/Xeon_core | 0 .../results/Xeon_uncore | 0 src/{gpuwattch => accelwattch}/sharedcache.cc | 0 src/{gpuwattch => accelwattch}/sharedcache.h | 0 .../technology_xeon_core.cc | 0 src/{gpuwattch => accelwattch}/version.h | 0 src/{gpuwattch => accelwattch}/xmlParser.cc | 0 src/{gpuwattch => accelwattch}/xmlParser.h | 0 src/cuda-sim/cuda-sim.cc | 203 ++- src/cuda-sim/instructions.cc | 33 +- src/cuda-sim/ptx.l | 67 +- src/cuda-sim/ptx_ir.cc | 25 +- src/gpgpu-sim/dram.cc | 26 +- src/gpgpu-sim/dram.h | 25 +- src/gpgpu-sim/gpu-cache.cc | 22 +- src/gpgpu-sim/gpu-cache.h | 21 +- src/gpgpu-sim/gpu-sim.cc | 170 ++- src/gpgpu-sim/gpu-sim.h | 63 +- src/gpgpu-sim/l2cache.cc | 30 +- src/gpgpu-sim/l2cache.h | 23 +- src/gpgpu-sim/power_interface.cc | 456 ++++++- src/gpgpu-sim/power_interface.h | 35 +- src/gpgpu-sim/power_stat.cc | 467 ++++--- src/gpgpu-sim/power_stat.h | 832 ++++++++---- src/gpgpu-sim/shader.cc | 94 +- src/gpgpu-sim/shader.h | 423 ++++-- src/gpgpu-sim/stat-tool.cc | 2 - src/gpgpu-sim/stat-tool.h | 2 + src/gpuwattch/gpgpu_sim_wrapper.cc | 863 ------------- version | 2 +- 165 files changed, 16621 insertions(+), 1868 deletions(-) create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/hw_perf.csv create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml rename src/{gpuwattch => accelwattch}/Alpha21364.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_DC.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_SBT.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_ST.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara2.xml (100%) rename src/{gpuwattch => accelwattch}/Penryn.xml 
(100%) rename src/{gpuwattch => accelwattch}/README (100%) rename src/{gpuwattch => accelwattch}/XML_Parse.cc (92%) rename src/{gpuwattch => accelwattch}/XML_Parse.h (89%) rename src/{gpuwattch => accelwattch}/Xeon.xml (100%) rename src/{gpuwattch => accelwattch}/arch_const.h (100%) rename src/{gpuwattch => accelwattch}/array.cc (100%) rename src/{gpuwattch => accelwattch}/array.h (100%) rename src/{gpuwattch => accelwattch}/basic_components.cc (100%) rename src/{gpuwattch => accelwattch}/basic_components.h (100%) rename src/{gpuwattch => accelwattch}/cacti/README (100%) rename src/{gpuwattch => accelwattch}/cacti/Ucache.cc (99%) rename src/{gpuwattch => accelwattch}/cacti/Ucache.h (100%) rename src/{gpuwattch => accelwattch}/cacti/arbiter.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/arbiter.h (100%) rename src/{gpuwattch => accelwattch}/cacti/area.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/area.h (100%) rename src/{gpuwattch => accelwattch}/cacti/bank.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/bank.h (100%) rename src/{gpuwattch => accelwattch}/cacti/basic_circuit.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/basic_circuit.h (100%) rename src/{gpuwattch => accelwattch}/cacti/batch_tests (100%) rename src/{gpuwattch => accelwattch}/cacti/cache.cfg (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti.i (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti.mk (96%) rename src/{gpuwattch => accelwattch}/cacti/cacti_interface.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti_interface.h (100%) rename src/{gpuwattch => accelwattch}/cacti/component.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/component.h (100%) rename src/{gpuwattch => accelwattch}/cacti/const.h (100%) rename src/{gpuwattch => accelwattch}/cacti/contention.dat (100%) rename src/{gpuwattch => accelwattch}/cacti/crossbar.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/crossbar.h (100%) rename src/{gpuwattch => accelwattch}/cacti/decoder.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/decoder.h (100%) rename src/{gpuwattch => accelwattch}/cacti/highradix.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/highradix.h (100%) rename src/{gpuwattch => accelwattch}/cacti/htree2.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/htree2.h (100%) rename src/{gpuwattch => accelwattch}/cacti/io.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/io.h (100%) rename src/{gpuwattch => accelwattch}/cacti/main.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/makefile (100%) rename src/{gpuwattch => accelwattch}/cacti/mat.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/mat.h (100%) rename src/{gpuwattch => accelwattch}/cacti/nuca.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/nuca.h (100%) rename src/{gpuwattch => accelwattch}/cacti/out_batch_test_result.csv (100%) rename src/{gpuwattch => accelwattch}/cacti/parameter.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/parameter.h (100%) rename src/{gpuwattch => accelwattch}/cacti/router.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/router.h (100%) rename src/{gpuwattch => accelwattch}/cacti/subarray.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/subarray.h (100%) rename src/{gpuwattch => accelwattch}/cacti/technology.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/uca.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/uca.h (100%) rename src/{gpuwattch => accelwattch}/cacti/wire.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/wire.h (100%) rename src/{gpuwattch => 
accelwattch}/core.cc (100%) rename src/{gpuwattch => accelwattch}/core.h (100%) rename src/{gpuwattch => accelwattch}/fermi.xml (100%) rename src/{gpuwattch => accelwattch}/globalvar.h (100%) rename src/{gpuwattch => accelwattch}/gpgpu.xml (100%) rename src/{gpuwattch => accelwattch}/gpgpu_sim.verify (100%) create mode 100644 src/accelwattch/gpgpu_sim_wrapper.cc rename src/{gpuwattch => accelwattch}/gpgpu_sim_wrapper.h (68%) rename src/{gpuwattch => accelwattch}/gpgpu_static.xml (100%) rename src/{gpuwattch => accelwattch}/interconnect.cc (100%) rename src/{gpuwattch => accelwattch}/interconnect.h (100%) rename src/{gpuwattch => accelwattch}/iocontrollers.cc (100%) rename src/{gpuwattch => accelwattch}/iocontrollers.h (100%) rename src/{gpuwattch => accelwattch}/logic.cc (100%) rename src/{gpuwattch => accelwattch}/logic.h (100%) rename src/{gpuwattch => accelwattch}/main.cc (100%) rename src/{gpuwattch => accelwattch}/makefile (100%) rename src/{gpuwattch => accelwattch}/mcpat.mk (97%) rename src/{gpuwattch => accelwattch}/mcpatXeonCore.mk (100%) rename src/{gpuwattch => accelwattch}/memoryctrl.cc (100%) rename src/{gpuwattch => accelwattch}/memoryctrl.h (100%) rename src/{gpuwattch => accelwattch}/noc.cc (100%) rename src/{gpuwattch => accelwattch}/noc.h (100%) rename src/{gpuwattch => accelwattch}/processor.cc (99%) rename src/{gpuwattch => accelwattch}/processor.h (100%) rename src/{gpuwattch => accelwattch}/quadro.xml (100%) rename src/{gpuwattch => accelwattch}/results/Alpha21364 (100%) rename src/{gpuwattch => accelwattch}/results/Alpha21364_90nm (100%) rename src/{gpuwattch => accelwattch}/results/Penryn (100%) rename src/{gpuwattch => accelwattch}/results/T1 (100%) rename src/{gpuwattch => accelwattch}/results/T1_DC_64 (100%) rename src/{gpuwattch => accelwattch}/results/T1_SBT_64 (100%) rename src/{gpuwattch => accelwattch}/results/T1_ST_64 (100%) rename src/{gpuwattch => accelwattch}/results/T2 (100%) rename src/{gpuwattch => accelwattch}/results/Xeon_core (100%) rename src/{gpuwattch => accelwattch}/results/Xeon_uncore (100%) rename src/{gpuwattch => accelwattch}/sharedcache.cc (100%) rename src/{gpuwattch => accelwattch}/sharedcache.h (100%) rename src/{gpuwattch => accelwattch}/technology_xeon_core.cc (100%) rename src/{gpuwattch => accelwattch}/version.h (100%) rename src/{gpuwattch => accelwattch}/xmlParser.cc (100%) rename src/{gpuwattch => accelwattch}/xmlParser.h (100%) delete mode 100644 src/gpuwattch/gpgpu_sim_wrapper.cc diff --git a/CHANGES b/CHANGES index 7964153c0..5d1cd1082 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,8 @@ LOG: +Version 4.2.0 vs 4.1.0 +- Added AccelWattch power model v1.0 which replaces GPUWattch. +- Added AccelWattch XML configuration files for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, SM6_TITANX. Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. + Version 4.1.0 versus 4.0.0 -Features: 1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through diff --git a/COPYRIGHT b/COPYRIGHT index a4eea2915..1c949f93e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -44,3 +44,33 @@ per UBC policy 88, item 2.3 on literary works) these students names appear in the copyright notices of the respective files. UBC is also mentioned in the copyright notice to highlight that was the author's affiliation when the work was performed. + +NOTE 3: AccelWattch and all its components are covered by the following license and copyright. 
+Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +Northwestern University, Purdue University, The University of British Columbia +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of Northwestern University, Purdue University, + The University of British Columbia nor the names of their contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/Makefile b/Makefile index d248211cd..82ea39928 100644 --- a/Makefile +++ b/Makefile @@ -87,7 +87,7 @@ ifneq ($(GPGPUSIM_POWER_MODEL),) MCPAT_DBG_FLAG = dbg endif - MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/gpuwattch + MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/accelwattch MCPAT = $(MCPAT_OBJ_DIR)/*.o endif @@ -117,24 +117,24 @@ check_setup_environment: fi check_power: - @if [ -d "$(GPGPUSIM_ROOT)/src/gpuwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ + @if [ -d "$(GPGPUSIM_ROOT)/src/accelwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ - echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/gpuwattch) but GPGPUSIM_POWER_MODEL not set."; \ - echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the gpuwattch directory if you would like to include the GPGPU-Sim Power Model."; \ + echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/accelwattch) but GPGPUSIM_POWER_MODEL not set."; \ + echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the accelwattch directory if you would like to include the GPGPU-Sim Power Model."; \ echo ""; \ true; \ elif [ ! -d "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "($(GPGPUSIM_POWER_MODEL)) is not a valid directory."; \ - echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim gpuwattch directory."; \ + echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim accelwattch directory."; \ echo ""; \ exit 101; \ elif [ -n "$(GPGPUSIM_POWER_MODEL)" -a ! 
-f "$(GPGPUSIM_POWER_MODEL)/gpgpu_sim.verify" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "gpgpu_sim.verify not found in $(GPGPUSIM_POWER_MODEL)."; \ - echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid gpuwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ + echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid accelwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ echo ""; \ exit 102; \ fi @@ -243,8 +243,8 @@ makedirs: if [ ! -d $(SIM_OBJ_FILES_DIR)/libopencl/bin ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/libopencl/bin; fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/$(INTERSIM) ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/$(INTERSIM); fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch/cacti; fi; all: $(MAKE) gpgpusim diff --git a/README.md b/README.md index 9bb891659..da0893585 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ Welcome to GPGPU-Sim, a cycle-level simulator modeling contemporary graphics processing units (GPUs) running GPU computing workloads written in CUDA or OpenCL. Also included in GPGPU-Sim is a performance visualization tool called -AerialVision and a configurable and extensible energy model called GPUWattch. -GPGPU-Sim and GPUWattch have been rigorously validated with performance and +AerialVision and a configurable and extensible power model called AccelWattch. +GPGPU-Sim and AccelWattch have been rigorously validated with performance and power measurements of real hardware GPUs. This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, @@ -38,12 +38,11 @@ Md Aamir Raihan, Negar Goli, Tor Aamodt, Modeling Deep Learning Accelerator Enabled GPUs, arXiv:1811.08309, https://arxiv.org/abs/1811.08309 -If you use the GPUWattch energy model in your research, please cite: +If you use the AccelWattch power model in your research, please cite: -Jingwen Leng, Tayler Hetherington, Ahmed ElTantawy, Syed Gilani, Nam Sung Kim, -Tor M. Aamodt, Vijay Janapa Reddi, GPUWattch: Enabling Energy Optimizations in -GPGPUs, In proceedings of the ACM/IEEE International Symposium on Computer -Architecture (ISCA 2013), Tel-Aviv, Israel, June 23-27, 2013. +Vijay Kandiah, Scott Peverelle, Mahmoud Khairy, Junrui Pan, Amogh Manjunath, Timothy G. Rogers, Tor M. Aamodt, and Nikos Hardavellas. 2021. +AccelWattch: A Power Modeling Framework for Modern GPUs. In MICRO54: 54th Annual IEEE/ACM International Symposium on Microarchitecture +(MICRO ’21), October 18–22, 2021, Virtual Event, Greece. If you use the support for CUDA dynamic parallelism in your research, please cite: @@ -62,8 +61,8 @@ This file contains instructions on installing, building and running GPGPU-Sim. Detailed documentation on what GPGPU-Sim models, how to configure it, and a guide to the source code can be found here: . Instructions for building doxygen source code documentation are included below. -Detailed documentation on GPUWattch including how to configure it and a guide -to the source code can be found here: . 
+ +Previous versions of GPGPU-Sim (3.2.0 to 4.1.0) included the [GPUWattch Energy model](http://gpgpu-sim.org/gpuwattch/) which has been replaced by AccelWattch version 1.0 in GPGPU-Sim version 4.2.0. AccelWattch supports modern GPUs and is validated against a NVIDIA Volta QV100 GPU. Detailed documentation on AccelWattch can be found here: [AccelWattch Overview](https://github.com/VijayKandiah/accel-sim-framework#accelwattch-overview) and [AccelWattch MICRO'21 Artifact Manual](https://github.com/VijayKandiah/accel-sim-framework/blob/release/AccelWattch.md). If you have questions, please sign up for the google groups page (see gpgpu-sim.org), but note that use of this simulator does not imply any level of @@ -108,21 +107,20 @@ library (part of the CUDA toolkit). Code to interface with the CUDA Math library is contained in cuda-math.h, which also includes several structures derived from vector_types.h (one of the CUDA header files). -## GPUWattch Energy Model +## AccelWattch Power Model -GPUWattch (introduced in GPGPU-Sim 3.2.0) was developed by researchers at the -University of British Columbia, the University of Texas at Austin, and the -University of Wisconsin-Madison. Contributors to GPUWattch include Tor -Aamodt's research group at the University of British Columbia: Tayler -Hetherington and Ahmed ElTantawy; Vijay Reddi's research group at the -University of Texas at Austin: Jingwen Leng; and Nam Sung Kim's research group -at the University of Wisconsin-Madison: Syed Gilani. +AccelWattch (introduced in GPGPU-Sim 4.2.0) was developed by researchers at +Northwestern University, Purdue University, and the University of British Columbia. +Contributors to AccelWattch include Nikos Hardavellas's research group at Northwestern University: +Vijay Kandiah; Tor Aamodt's research group at the University of British Columbia: Scott Peverelle; +and Timothy Rogers's research group at Purdue University: Mahmoud Khairy, Junrui Pan, and Amogh Manjunath. -GPUWattch leverages McPAT, which was developed by Sheng Li et al. at the +AccelWattch leverages McPAT, which was developed by Sheng Li et al. at the University of Notre Dame, Hewlett-Packard Labs, Seoul National University, and -the University of California, San Diego. The paper can be found at +the University of California, San Diego. The McPAT paper can be found at http://www.hpl.hp.com/research/mcpat/micro09.pdf. + # INSTALLING, BUILDING and RUNNING GPGPU-Sim Assuming all dependencies required by GPGPU-Sim are installed on your system, @@ -316,15 +314,16 @@ need to re-compile your application simply to run it on GPGPU-Sim. To revert back to running on the hardware, remove GPGPU-Sim from your LD_LIBRARY_PATH environment variable. -The following GPGPU-Sim configuration options are used to enable GPUWattch +The following GPGPU-Sim configuration options are used to enable AccelWattch -power_simulation_enabled 1 (1=Enabled, 0=Not enabled) - -gpuwattch_xml_file .xml - + -power_simulation_mode 0 (0=AccelWattch_SASS_SIM or AccelWattch_PTX_SIM, 1=AccelWattch_SASS_HW, 2=AccelWattch_SASS_HYBRID) + -accelwattch_xml_file .xml -The GPUWattch XML configuration file name is set to gpuwattch.xml by default and -currently only supplied for GTX480 (default=gpuwattch_gtx480.xml). Please refer to - for more information. +The AccelWattch XML configuration file name is set to accelwattch_sass_sim.xml by default and is +currently provided for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, and SM6_TITANX. 
+Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. Please refer to + for more information. Running OpenCL applications is identical to running CUDA applications. However, OpenCL applications need to communicate with the NVIDIA driver in order to diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config index 5b243a5b6..652f0a09e 100644 --- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config +++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config @@ -1,3 +1,32 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + # This config models the Pascal GP102 (NVIDIA TITAN X) # For more info about this card, see Nvidia White paper # http://international.download.nvidia.com/geforce-com/international/pdfs/GeForce_GTX_1080_Whitepaper_FINAL.pdf @@ -28,6 +57,7 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # Pascal clock domains #-gpgpu_clock_domains ::: @@ -170,11 +200,8 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Pascal 102 --power_simulation_enabled 0 # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 - diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 0ae91a50f..2a9bff015 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -175,5 +175,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt new file mode 100644 index 000000000..eed1c34b6 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt @@ -0,0 +1,73 @@ +//52*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 52; +n = 1; + +// Routing + +routing_function = dest_tag; + +// Flow control + +num_vcs = 1; +vc_buf_size = 64; +input_buffer_size = 256; +ejection_buffer_size = 64; +boundary_buffer_size = 64; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config new file mode 100644 index 000000000..0fb4742e1 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config @@ -0,0 +1,210 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# This config models the Turing RTX 2060 Super +# For more info about turing architecture: +# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf +# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 75 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 5 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 34 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1905.0:1905.0:1905.0:3500.0 +# boost mode +# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 75 + +# This implies a maximum of 32 warps/SM +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units +## We need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,32 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Turing has four schedulers per core +-gpgpu_num_sched_per_core 
4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Turing, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 75 + +# Turing has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Register File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +-gpgpu_adaptive_cache_config 0 +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_shmem_size 65536 +-gpgpu_shmem_sizeDefault 65536 +-gpgpu_shmem_per_block 65536 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 20 +-gpgpu_smem_latency 20 +-gpgpu_flush_l1_cache 1 + +# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 4MB L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 0 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprecated in Volta. It is used for legacy apps only.
Use L1D cache instead with .nc modifier or __ldg mehtod
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_turing_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_arbiter_algo 1
+-icnt_flit_size 40
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# Turing has GDDR6
+# http://monitorinsider.com/GDDR6.html
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 2
+-gpgpu_dram_burst_length 16
+-dram_data_command_freq_ratio 4
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+
+# Use the same GDDR5 timing, scaled to 3500MHZ
+-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
+
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index 425bc1690..76c99b7d6 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -1,4 +1,34 @@
-# This config models the Volta
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1.
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 # For more info about volta architecture: # http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# @@ -34,10 +64,11 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 32 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # volta clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0 +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 # boost mode # -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 @@ -199,9 +230,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/configs/tested-cfgs/SM7_QV100/hw_perf.csv b/configs/tested-cfgs/SM7_QV100/hw_perf.csv new file mode 100644 index 000000000..aa88bb256 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/hw_perf.csv @@ -0,0 +1,26 @@ +Benchmark,Kernel,L1_RH,L1_RM,L1_WH,L1_WM,CC_ACC,SHRD_ACC,DRAM_Rd,DRAM_Wr,L2_RH,L2_RM,L2_WH,L2_WM,NOC,Pipeline_Duty,Num_Idle_SMs,Elapsed_Cycles,Chip Voltage +b+tree-rodinia-3.1,findRangeK,1634256.0,561818.0,40785.0,19032.0,0.0,0.0,259346.0,3524.0,396522.0,259508.0,60000.0,0.0,1343246.0,0.3268163900773488,5.064000000000002,66542.7,1.0 +b+tree-rodinia-3.1,findK,1318908.0,525035.0,42619.0,7404.0,0.0,0.0,255317.0,2582.0,366918.0,255364.0,50000.0,0.0,1250108.0,0.2740918672650619,3.191999999999995,80883.0,1.0 +backprop-rodinia-3.1,_Z22bpnn_layerforward_CUDAPfS_S_S_ii,49152.0,143738.0,192432.0,4232.0,0.0,413696.0,147464.0,60097.0,29059.0,147460.0,196608.0,0.0,704512.0,0.5619432556155418,7.520000000000007,23324.775,1.0 +backprop-rodinia-3.1,_Z24bpnn_adjust_weights_cudaPfiS_iS_S_,465990.0,277805.0,327015.0,887.0,0.0,0.0,286738.0,190646.0,54315.0,286734.0,327686.0,0.0,1263518.0,0.20116733697224465,9.496000000000002,32578.425,1.0 
+hotspot-rodinia-3.1,_Z14calculate_tempiPfS_S_iiiiffffff,4250.0,691050.0,0.0,175104.0,0.0,997428.0,262147.0,66263.0,486965.0,262144.0,175104.0,0.0,1732988.0,0.9470499252952201,3.3200000000000074,56438.825,1.0 +kmeans-rodinia-3.1,_Z11kmeansPointPfiiiPiS_S_S0_,0.0,0.0,0.0,102400.0,4352107.0,0.0,12302960.0,92472.5,6742186.0,12321532.0,102400.0,0.0,26022036.0,0.11420395712434231,1.5799999999999947,894550.775,1.0 +srad_v1-rodinia-3.1,_Z4sradfiilPiS_S_S_PfS0_S0_S0_fS0_S0_,158304.87000000002,89035.40999999999,0.0,143700.0,0.0,0.0,28986.500000000033,45424.200000000026,68135.7,28984.00000000001,143700.0,0.0,481258.2600000001,0.5320091849844065,15.272880000000004,14251.741749999997,1.0 +parboil-sad,_Z11mb_sad_calcPtS_ii,101840.0,415925.0,2102177.0,7289373.0,0.0,10033920.0,257308.0,8720433.0,8754664.0,257280.0,9390720.0,0.0,36398656.0,0.25130932753519797,0.19199999999999662,6551129.125,1.0 +parboil-sgemm,_Z9mysgemmNTPKfiS0_iPfiiff,7109956.0,2452728.0,133388.0,1284.0,0.0,8642304.0,393092.0,36894.0,2059512.0,393088.0,135168.0,0.0,5176696.0,0.5495706862295477,1.8799999999999972,358744.025,1.0 +parboil-mri-q,_Z12ComputeQ_GPUiiPfS_S_S_S_,0.0,163840.0,65184.0,154.0,17617612.5,0.0,164356.0,0.0,0.0,163840.0,65536.0,0.0,458752.0,0.5767256645623982,12.363999999999997,691892.925,1.0 +dct8x8,_Z14CUDAkernel1DCTPfiiiy,0.0,0.0,552.8,32121.9,786431.9999999999,114688.00000000001,32786.0,0.0,16383.999999999998,32767.999999999996,32767.999999999996,0.0,131071.99999999999,0.06091433507559575,7.7799999999999985,24207.632500000003,1.0 +dct8x8,_Z14CUDAkernel2DCTPfS_i,0.0,32768.00000000002,0.0,32768.00000000002,0.0,49152.00000000004,32773.25742574254,0.0,0.0,32768.00000000002,32768.00000000002,0.0,131072.0000000001,0.14345732731755537,30.750257425742568,5822.941584158416,1.0 +binomialOptions,_Z21binomialOptionsKernelv,0.0,0.0,0.0,1024.0,23688.0,16778240.0,640.0,0.0,0.0,0.0,1024.0,0.0,2048.0,0.6457304629145744,1.9519999999999982,1366301.225,1.0 +fastWalshTransform,_Z15fwtBatch2KernelPfS_i,0.0,1048576.0000000002,774120.4444444445,271536.22222222225,0.0,0.0,1048581.888888889,945003.222222222,0.0,1048576.0000000002,1048576.0000000002,0.0,4194304.000000001,0.0867005928407203,2.574222222222223,120947.73472222223,1.0 +fastWalshTransform,_Z15fwtBatch1KernelPfS_i,0.0,1048576.0,645060.0,403890.6666666666,0.0,3407872.0,1048581.0,950303.3333333333,0.0,1048576.0,1048576.0,0.0,4194304.0,0.3836524328760675,2.621333333333329,149487.8,1.0 +histogram,_Z17histogram64KernelPjP5uint4j,0.0,2097152.0,0.0,34960.0,0.0,4893504.000000001,2097184.2941176468,26959.294117647052,0.0,2097152.0,34960.0,0.0,4264223.999999999,0.3361853461559831,3.706823529411762,146480.14411764703,1.0 +mergeSort,_Z21mergeSortSharedKernelILj1EEvPjS0_S0_S0_j,0.0,1048576.0,0.0,1048576.0,0.0,12976128.0,1048580.0,950169.0,0.0,1048576.0,1048576.0,0.0,4194304.0,0.9137102229423307,1.1600000000000055,439316.525,1.0 +mergeSort,_Z30mergeElementaryIntervalsKernelILj1EEvPjS0_S0_S0_S0_S0_jj,152481.75,1127706.3333333333,439852.24999999994,829969.9166666665,0.0,3670010.1666666665,1056772.0000000002,959704.0833333334,199523.16666666666,1056768.0,1269875.1666666667,0.0,4878632.833333334,0.44812863772322986,1.6420000000000003,157457.05,1.0 +quasirandomGenerator,_Z26quasirandomGeneratorKernelPfjj,0.0,0.0,0.0,393215.9999999999,47616.000000000015,0.0,21.0,294938.38095238095,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.6109600290450061,17.68266666666667,80626.8130952381,1.0 
+quasirandomGenerator,_Z16inverseCNDKernelPfPjj,0.0,0.0,0.0,393215.9999999999,0.0,0.0,5.952380952380952,294941.6666666666,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.307434624439692,5.790476190476192,58367.4988095238,1.0 +sobolQRNG,_Z15sobolGPU_kerneljjPjPf,172832.0,31976.0,0.0,1250000.0,0.0,1899700.0,405.0,1151641.0,31592.0,400.0,1250000.0,0.0,2563936.0,0.6380044567750587,2.7840000000000042,112087.775,1.0 +cutlass_perf_test_k1,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,460800.0,0.0,5120.0,160.0,577120.0000000001,412167.99999999994,42.285714285714285,48640.0,412160.0,5120.0,0.0,931840.0,0.24658369358809393,60.32228571428572,139808.59999999998,1.0 
+cutlass_perf_test_k2,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,2097151.9999999995,171796.0,65782.85714285714,255.99999999999994,1464319.9999999998,1081352.2857142857,45.42857142857143,1015808.0000000002,1081344.0,237568.0,0.0,4669440.0,0.38530040572560803,48.440000000000005,228263.9035714286,1.0 
+cutlass_perf_test_k3,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,3276800.0000000005,429682.85714285716,164204.57142857142,640.0,2309120.0,491527.9999999999,77869.28571428571,2785279.9999999995,491519.99999999994,593920.0000000001,0.0,7741440.0,0.8525726478636384,1.832,161781.07857142857,1.0 +cudaTensorCoreGemm,_Z12compute_gemmPK6__halfS1_PKfPfff,0.0,69206016.0,0.0,2097152.0,0.0,30146560.0,16974052.0,1998866.0,52232060.0,16973824.0,2097152.0,0.0,142606336.0,0.7380984268363922,1.264000000000003,3871172.375,1.0 diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML parameters; tag content not preserved in this copy]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
index 0c69c7084..5c6be224a 100644
--- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -200,9 +200,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/format-code.sh b/format-code.sh index 9f470854b..ac753f059 100755 --- a/format-code.sh +++ b/format-code.sh @@ -8,5 +8,5 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc -clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc \ No newline at end of file +clang-format -i ${THIS_DIR}/src/accelwattch/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.cc \ No newline at end of file diff --git a/setup_environment b/setup_environment index 07d078844..d3ff8403c 100644 --- a/setup_environment +++ b/setup_environment @@ -117,18 +117,18 @@ fi # The following checks to see if the GPGPU-Sim power model is enabled. # GPGPUSIM_POWER_MODEL points to the directory where gpgpusim_mcpat is located. -# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/gpuwattch/". -if [ -d $GPGPUSIM_ROOT/src/gpuwattch/ ]; then - if [ ! -f $GPGPUSIM_ROOT/src/gpuwattch/gpgpu_sim.verify ]; then - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch"; +# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/accelwattch/". +if [ -d $GPGPUSIM_ROOT/src/accelwattch/ ]; then + if [ ! -f $GPGPUSIM_ROOT/src/accelwattch/gpgpu_sim.verify ]; then + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch"; return; fi - export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/gpuwattch/; - echo "configured with GPUWattch."; + export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/accelwattch/; + echo "configured with AccelWattch."; elif [ -n "$GPGPUSIM_POWER_MODEL" ]; then if [ ! -f $GPGPUSIM_POWER_MODEL/gpgpu_sim.verify ]; then echo ""; - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch/ - Either incorrect directory or incorrect McPAT version"; + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch/ - Either incorrect directory or incorrect McPAT version"; return; fi echo "configure with power model in $GPGPUSIM_POWER_MODEL."; diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 30aee60c9..208047eeb 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "abstract_hardware_model.h" #include #include @@ -281,14 +283,16 @@ void warp_inst_t::broadcast_barrier_reduction( void warp_inst_t::generate_mem_accesses() { if (empty() || op == MEMORY_BARRIER_OP || m_mem_accesses_created) return; if (!((op == LOAD_OP) || (op == TENSOR_CORE_LOAD_OP) || (op == STORE_OP) || - (op == TENSOR_CORE_STORE_OP))) + (op == TENSOR_CORE_STORE_OP) )) return; if (m_warp_active_mask.count() == 0) return; // predicated off const size_t starting_queue_size = m_accessq.size(); assert(is_load() || is_store()); - assert(m_per_scalar_thread_valid); // need address information per thread + + //if((space.get_type() != tex_space) && (space.get_type() != const_space)) + assert(m_per_scalar_thread_valid); // need address information per thread bool is_write = is_store(); diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 35e28ca57..f04741f75 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -59,6 +60,30 @@ enum _memory_space_t { instruction_space }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; +#endif + enum FuncCache { FuncCachePreferNone = 0, FuncCachePreferShared = 1, @@ -134,8 +159,14 @@ enum special_operations_t { FP_SQRT_OP, FP_LG_OP, FP_SIN_OP, - FP_EXP_OP + FP_EXP_OP, + DP_MUL_OP, + DP_DIV_OP, + DP___OP, + TENSOR__OP, + TEX__OP }; + typedef enum special_operations_t special_ops; // Required to identify for the power model enum operation_pipeline_t { @@ -911,6 +942,7 @@ class inst_t { sp_op = OTHER_OP; op_pipe = UNKOWN_OP; mem_op = NOT_TEX; + const_cache_operand = 0; num_operands = 0; num_regs = 0; memset(out, 0, sizeof(unsigned)); @@ -939,6 +971,20 @@ class inst_t { return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || memory_op == memory_store); } + + bool is_fp() const { return ((sp_op == FP__OP));} //VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP));} + bool is_fpmul() const { return ((sp_op == FP_MUL_OP));} + bool is_dp() const { return ((sp_op == DP___OP));} + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP));} + bool is_dpmul() const { return ((sp_op == DP_MUL_OP));} + bool is_imul() const { return ((sp_op == INT_MUL_OP));} + bool is_imul24() const { return ((sp_op == INT_MUL24_OP));} + bool is_imul32() const { return ((sp_op == INT_MUL32_OP));} + bool is_idiv() const { return ((sp_op == INT_DIV_OP));} + bool is_sfu() const {return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || (sp_op == TENSOR__OP));} + bool is_alu() const {return (sp_op == INT__OP);} + unsigned get_num_operands() const { return num_operands; } unsigned get_num_regs() const { return num_regs; } void set_num_regs(unsigned num) { num_regs = num; } @@ -962,6 +1008,7 @@ class inst_t { operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of // the operation (SP, SFU or MEM) mem_operation mem_op; // code (uarch visible) identify memory type + bool const_cache_operand; // has a load from constant memory as an operand _memory_op_t memory_op; // memory_op used by ptxplus unsigned num_operands; unsigned num_regs; // count vector operand as one register operand diff --git a/src/gpuwattch/Alpha21364.xml b/src/accelwattch/Alpha21364.xml similarity index 100% rename from src/gpuwattch/Alpha21364.xml rename to src/accelwattch/Alpha21364.xml diff --git a/src/gpuwattch/Niagara1.xml b/src/accelwattch/Niagara1.xml similarity index 100% rename from src/gpuwattch/Niagara1.xml rename to src/accelwattch/Niagara1.xml diff --git a/src/gpuwattch/Niagara1_sharing.xml b/src/accelwattch/Niagara1_sharing.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing.xml rename to src/accelwattch/Niagara1_sharing.xml diff --git 
a/src/gpuwattch/Niagara1_sharing_DC.xml b/src/accelwattch/Niagara1_sharing_DC.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_DC.xml rename to src/accelwattch/Niagara1_sharing_DC.xml diff --git a/src/gpuwattch/Niagara1_sharing_SBT.xml b/src/accelwattch/Niagara1_sharing_SBT.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_SBT.xml rename to src/accelwattch/Niagara1_sharing_SBT.xml diff --git a/src/gpuwattch/Niagara1_sharing_ST.xml b/src/accelwattch/Niagara1_sharing_ST.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_ST.xml rename to src/accelwattch/Niagara1_sharing_ST.xml diff --git a/src/gpuwattch/Niagara2.xml b/src/accelwattch/Niagara2.xml similarity index 100% rename from src/gpuwattch/Niagara2.xml rename to src/accelwattch/Niagara2.xml diff --git a/src/gpuwattch/Penryn.xml b/src/accelwattch/Penryn.xml similarity index 100% rename from src/gpuwattch/Penryn.xml rename to src/accelwattch/Penryn.xml diff --git a/src/gpuwattch/README b/src/accelwattch/README similarity index 100% rename from src/gpuwattch/README rename to src/accelwattch/README diff --git a/src/gpuwattch/XML_Parse.cc b/src/accelwattch/XML_Parse.cc similarity index 92% rename from src/gpuwattch/XML_Parse.cc rename to src/accelwattch/XML_Parse.cc index 1b9a38ae1..eaec74806 100644 --- a/src/gpuwattch/XML_Parse.cc +++ b/src/accelwattch/XML_Parse.cc @@ -30,12 +30,14 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "XML_Parse.h" #include #include @@ -43,13 +45,14 @@ using namespace std; -const char* perf_count_label[] = { - "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", - "DC_RM,", "DC_WH,", "DC_WM,", "TC_H,", "TC_M,", - "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", - "NON_REG_OPs,", "SP_ACC,", "SFU_ACC,", "FPU_ACC,", "MEM_RD,", - "MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", - "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "CONST_DYNAMICN"}; +const char * perf_count_label[] = { + "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", + "TC_H,", "TC_M,", "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", + "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", "INT_MUL32_ACC,", "INT_MUL_ACC,","INT_DIV_ACC,", + "FP_MUL_ACC,", "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", "FP_EXP_ACC,", "DP_MUL_ACC,", + "DP_DIV_ACC,", "TENSOR_ACC,", "TEX_ACC,", "MEM_RD,","MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", + "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; + void ParseXML::parse(char* filepath) { unsigned int i, j, k, m, n; @@ -160,6 +163,199 @@ void ParseXML::parse(char* filepath) { atoi(xNode2.getChildNode("param", i).getAttribute("value")); continue; } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "modeled_chip_voltage_ref") == 0) { + sys.modeled_chip_voltage_ref = + atof(xNode2.getChildNode("param", 
i).getAttribute("value")); + continue; + } + + + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_flane") == 0) { + sys.static_cat1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_flane") == 0) { + sys.static_cat2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_flane") == 0) { + sys.static_cat3_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_flane") == 0) { + sys.static_cat4_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_flane") == 0) { + sys.static_cat5_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_flane") == 0) { + sys.static_cat6_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_flane") == 0) { + sys.static_shared_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_flane") == 0) { + sys.static_l1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_flane") == 0) { + sys.static_l2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_flane") == 0) { + sys.static_light_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_flane") == 0) { + sys.static_intadd_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_flane") == 0) { + sys.static_intmul_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_flane") == 0) { + sys.static_geomean_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_addlane") == 0) { + sys.static_cat1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_addlane") == 0) { + sys.static_cat2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_addlane") == 0) { + sys.static_cat3_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_addlane") == 0) { + sys.static_cat4_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + 
continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_addlane") == 0) { + sys.static_cat5_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_addlane") == 0) { + sys.static_cat6_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_addlane") == 0) { + sys.static_shared_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_addlane") == 0) { + sys.static_l1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_addlane") == 0) { + sys.static_l2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_addlane") == 0) { + sys.static_light_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_addlane") == 0) { + sys.static_intadd_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_addlane") == 0) { + sys.static_intmul_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_addlane") == 0) { + sys.static_geomean_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "target_chip_area") == 0) { sys.target_chip_area = @@ -419,22 +615,106 @@ void ParseXML::parse(char* filepath) { atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SP_ACC") == 0) { - sys.scaling_coefficients[SP_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_ACC")==0) { + sys.scaling_coefficients[INT_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SFU_ACC") == 0) { - sys.scaling_coefficients[SFU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_ACC")==0) { + sys.scaling_coefficients[FP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "FPU_ACC") == 0) { - sys.scaling_coefficients[FPU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_ACC")==0) { + sys.scaling_coefficients[DP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL24_ACC")==0) { + sys.scaling_coefficients[INT_MUL24_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if 
(strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL32_ACC")==0) { + sys.scaling_coefficients[INT_MUL32_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL_ACC")==0) { + sys.scaling_coefficients[INT_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_DIV_ACC")==0) { + sys.scaling_coefficients[INT_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_MUL_ACC")==0) { + sys.scaling_coefficients[FP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_DIV_ACC")==0) { + sys.scaling_coefficients[FP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SQRT_ACC")==0) { + sys.scaling_coefficients[FP_SQRT_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_LG_ACC")==0) { + sys.scaling_coefficients[FP_LG_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SIN_ACC")==0) { + sys.scaling_coefficients[FP_SIN_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_EXP_ACC")==0) { + sys.scaling_coefficients[FP_EXP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_MUL_ACC")==0) { + sys.scaling_coefficients[DP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_DIV_ACC")==0) { + sys.scaling_coefficients[DP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TENSOR_ACC")==0) { + sys.scaling_coefficients[TENSOR_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TEX_ACC")==0) { + sys.scaling_coefficients[TEX_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), @@ -498,8 +778,8 @@ void ParseXML::parse(char* filepath) { continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "CONST_DYNAMICN") == 0) { - sys.scaling_coefficients[CONST_DYNAMICN] = + "constant_power") == 0) { + sys.scaling_coefficients[constant_power] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } @@ -4187,8 +4467,9 @@ void ParseXML::initialize() // Initialize all // strcpy(sys.homogeneous_cores,"default"); sys.core_tech_node = 1; sys.target_core_clockrate = 1; + sys.modeled_chip_voltage_ref = 1; sys.target_chip_area = 1; - sys.temperature = 1; + sys.temperature = 340; sys.number_cache_levels = 1; sys.homogeneous_cores = 1; sys.homogeneous_L1Directories = 1; @@ -4198,6 +4479,34 @@ void ParseXML::initialize() // Initialize all 
sys.homogeneous_NoCs = 1; sys.homogeneous_ccs = 1; + sys.static_cat1_flane = 0; + sys.static_cat2_flane = 0; + sys.static_cat3_flane = 0; + sys.static_cat4_flane = 0; + sys.static_cat5_flane = 0; + sys.static_cat6_flane = 0; + sys.static_shared_flane = 0; + sys.static_l1_flane = 0; + sys.static_l2_flane = 0; + sys.static_light_flane = 0; + sys.static_intadd_flane = 0; + sys.static_intmul_flane = 0; + sys.static_geomean_flane = 0; + + sys.static_cat1_addlane = 0; + sys.static_cat2_addlane = 0; + sys.static_cat3_addlane = 0; + sys.static_cat4_addlane = 0; + sys.static_cat5_addlane = 0; + sys.static_cat6_addlane = 0; + sys.static_shared_addlane = 0; + sys.static_l1_addlane = 0; + sys.static_l2_addlane = 0; + sys.static_light_addlane = 0; + sys.static_intadd_addlane = 0; + sys.static_intmul_addlane = 0; + sys.static_geomean_addlane = 0; + sys.Max_area_deviation = 1; sys.Max_power_deviation = 1; sys.device_type = 1; diff --git a/src/gpuwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h similarity index 89% rename from src/gpuwattch/XML_Parse.h rename to src/accelwattch/XML_Parse.h index 30c4e4b13..c82359faf 100644 --- a/src/gpuwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -30,10 +30,11 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ #ifndef XML_PARSE_H_ @@ -69,7 +70,7 @@ ToXMLStringTool tx,tx2; extern const char* perf_count_label[]; enum perf_count_t { - TOT_INST = 0, + TOT_INST=0, FP_INT, IC_H, IC_M, @@ -85,9 +86,23 @@ enum perf_count_t { REG_RD, REG_WR, NON_REG_OPs, - SP_ACC, - SFU_ACC, - FPU_ACC, + INT_ACC, //SPU + FP_ACC, //FPU + DP_ACC, //FPU + INT_MUL24_ACC, //SFU + INT_MUL32_ACC, //SFU + INT_MUL_ACC, //SFU + INT_DIV_ACC, //SFU + FP_MUL_ACC, //SFU + FP_DIV_ACC, //SFU + FP_SQRT_ACC, //SFU + FP_LG_ACC, //SFU + FP_SIN_ACC, //SFU + FP_EXP_ACC, //SFU + DP_MUL_ACC, //SFU + DP_DIV_ACC, //SFU + TENSOR_ACC, //SFU + TEX_ACC, //SFU MEM_RD, MEM_WR, MEM_PRE, @@ -98,7 +113,7 @@ enum perf_count_t { NOC_A, PIPE_A, IDLE_CORE_N, - CONST_DYNAMICN, + constant_power, NUM_PERFORMANCE_COUNTERS }; @@ -635,6 +650,33 @@ typedef struct { int homogeneous_L2Directories; double core_tech_node; int target_core_clockrate; + double modeled_chip_voltage_ref; + double static_cat1_flane; + double static_cat2_flane; + double static_cat3_flane; + double static_cat4_flane; + double static_cat5_flane; + double static_cat6_flane; + double static_shared_flane; + double static_l1_flane; + double static_l2_flane; + double static_light_flane; + double static_intadd_flane; + double static_intmul_flane; + double static_geomean_flane; + double static_cat1_addlane; + double static_cat2_addlane; + double static_cat3_addlane; + double static_cat4_addlane; + double static_cat5_addlane; + double static_cat6_addlane; + double static_shared_addlane; + double static_l1_addlane; + double static_l2_addlane; + double static_light_addlane; + double static_intadd_addlane; + double static_intmul_addlane; + 
double static_geomean_addlane; int target_chip_area; int temperature; int number_cache_levels; diff --git a/src/gpuwattch/Xeon.xml b/src/accelwattch/Xeon.xml similarity index 100% rename from src/gpuwattch/Xeon.xml rename to src/accelwattch/Xeon.xml diff --git a/src/gpuwattch/arch_const.h b/src/accelwattch/arch_const.h similarity index 100% rename from src/gpuwattch/arch_const.h rename to src/accelwattch/arch_const.h diff --git a/src/gpuwattch/array.cc b/src/accelwattch/array.cc similarity index 100% rename from src/gpuwattch/array.cc rename to src/accelwattch/array.cc diff --git a/src/gpuwattch/array.h b/src/accelwattch/array.h similarity index 100% rename from src/gpuwattch/array.h rename to src/accelwattch/array.h diff --git a/src/gpuwattch/basic_components.cc b/src/accelwattch/basic_components.cc similarity index 100% rename from src/gpuwattch/basic_components.cc rename to src/accelwattch/basic_components.cc diff --git a/src/gpuwattch/basic_components.h b/src/accelwattch/basic_components.h similarity index 100% rename from src/gpuwattch/basic_components.h rename to src/accelwattch/basic_components.h diff --git a/src/gpuwattch/cacti/README b/src/accelwattch/cacti/README similarity index 100% rename from src/gpuwattch/cacti/README rename to src/accelwattch/cacti/README diff --git a/src/gpuwattch/cacti/Ucache.cc b/src/accelwattch/cacti/Ucache.cc similarity index 99% rename from src/gpuwattch/cacti/Ucache.cc rename to src/accelwattch/cacti/Ucache.cc index 8f733f73b..e92e67b91 100644 --- a/src/gpuwattch/cacti/Ucache.cc +++ b/src/accelwattch/cacti/Ucache.cc @@ -223,7 +223,7 @@ void * calc_time_mt_wrapper(void * void_obj) delete tag_arr.back(); data_arr.pop_back(); tag_arr.pop_back(); - + pthread_exit(NULL); } @@ -246,7 +246,7 @@ bool calculate_time( { DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem); - if (dyn_p.is_valid == false) + if (dyn_p.is_valid != true) { return false; } diff --git a/src/gpuwattch/cacti/Ucache.h b/src/accelwattch/cacti/Ucache.h similarity index 100% rename from src/gpuwattch/cacti/Ucache.h rename to src/accelwattch/cacti/Ucache.h diff --git a/src/gpuwattch/cacti/arbiter.cc b/src/accelwattch/cacti/arbiter.cc similarity index 100% rename from src/gpuwattch/cacti/arbiter.cc rename to src/accelwattch/cacti/arbiter.cc diff --git a/src/gpuwattch/cacti/arbiter.h b/src/accelwattch/cacti/arbiter.h similarity index 100% rename from src/gpuwattch/cacti/arbiter.h rename to src/accelwattch/cacti/arbiter.h diff --git a/src/gpuwattch/cacti/area.cc b/src/accelwattch/cacti/area.cc similarity index 100% rename from src/gpuwattch/cacti/area.cc rename to src/accelwattch/cacti/area.cc diff --git a/src/gpuwattch/cacti/area.h b/src/accelwattch/cacti/area.h similarity index 100% rename from src/gpuwattch/cacti/area.h rename to src/accelwattch/cacti/area.h diff --git a/src/gpuwattch/cacti/bank.cc b/src/accelwattch/cacti/bank.cc similarity index 100% rename from src/gpuwattch/cacti/bank.cc rename to src/accelwattch/cacti/bank.cc diff --git a/src/gpuwattch/cacti/bank.h b/src/accelwattch/cacti/bank.h similarity index 100% rename from src/gpuwattch/cacti/bank.h rename to src/accelwattch/cacti/bank.h diff --git a/src/gpuwattch/cacti/basic_circuit.cc b/src/accelwattch/cacti/basic_circuit.cc similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.cc rename to src/accelwattch/cacti/basic_circuit.cc diff --git a/src/gpuwattch/cacti/basic_circuit.h b/src/accelwattch/cacti/basic_circuit.h similarity index 100% rename from 
src/gpuwattch/cacti/basic_circuit.h rename to src/accelwattch/cacti/basic_circuit.h diff --git a/src/gpuwattch/cacti/batch_tests b/src/accelwattch/cacti/batch_tests similarity index 100% rename from src/gpuwattch/cacti/batch_tests rename to src/accelwattch/cacti/batch_tests diff --git a/src/gpuwattch/cacti/cache.cfg b/src/accelwattch/cacti/cache.cfg similarity index 100% rename from src/gpuwattch/cacti/cache.cfg rename to src/accelwattch/cacti/cache.cfg diff --git a/src/gpuwattch/cacti/cacti.i b/src/accelwattch/cacti/cacti.i similarity index 100% rename from src/gpuwattch/cacti/cacti.i rename to src/accelwattch/cacti/cacti.i diff --git a/src/gpuwattch/cacti/cacti.mk b/src/accelwattch/cacti/cacti.mk similarity index 96% rename from src/gpuwattch/cacti/cacti.mk rename to src/accelwattch/cacti/cacti.mk index 7f3c57338..41f9218f4 100644 --- a/src/gpuwattch/cacti/cacti.mk +++ b/src/accelwattch/cacti/cacti.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch/cacti +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch/cacti TARGET = cacti SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/cacti/cacti_interface.cc b/src/accelwattch/cacti/cacti_interface.cc similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.cc rename to src/accelwattch/cacti/cacti_interface.cc diff --git a/src/gpuwattch/cacti/cacti_interface.h b/src/accelwattch/cacti/cacti_interface.h similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.h rename to src/accelwattch/cacti/cacti_interface.h diff --git a/src/gpuwattch/cacti/component.cc b/src/accelwattch/cacti/component.cc similarity index 100% rename from src/gpuwattch/cacti/component.cc rename to src/accelwattch/cacti/component.cc diff --git a/src/gpuwattch/cacti/component.h b/src/accelwattch/cacti/component.h similarity index 100% rename from src/gpuwattch/cacti/component.h rename to src/accelwattch/cacti/component.h diff --git a/src/gpuwattch/cacti/const.h b/src/accelwattch/cacti/const.h similarity index 100% rename from src/gpuwattch/cacti/const.h rename to src/accelwattch/cacti/const.h diff --git a/src/gpuwattch/cacti/contention.dat b/src/accelwattch/cacti/contention.dat similarity index 100% rename from src/gpuwattch/cacti/contention.dat rename to src/accelwattch/cacti/contention.dat diff --git a/src/gpuwattch/cacti/crossbar.cc b/src/accelwattch/cacti/crossbar.cc similarity index 100% rename from src/gpuwattch/cacti/crossbar.cc rename to src/accelwattch/cacti/crossbar.cc diff --git a/src/gpuwattch/cacti/crossbar.h b/src/accelwattch/cacti/crossbar.h similarity index 100% rename from src/gpuwattch/cacti/crossbar.h rename to src/accelwattch/cacti/crossbar.h diff --git a/src/gpuwattch/cacti/decoder.cc b/src/accelwattch/cacti/decoder.cc similarity index 100% rename from src/gpuwattch/cacti/decoder.cc rename to src/accelwattch/cacti/decoder.cc diff --git a/src/gpuwattch/cacti/decoder.h b/src/accelwattch/cacti/decoder.h similarity index 100% rename from src/gpuwattch/cacti/decoder.h rename to src/accelwattch/cacti/decoder.h diff --git a/src/gpuwattch/cacti/highradix.cc b/src/accelwattch/cacti/highradix.cc similarity index 100% rename from src/gpuwattch/cacti/highradix.cc rename to src/accelwattch/cacti/highradix.cc diff --git a/src/gpuwattch/cacti/highradix.h b/src/accelwattch/cacti/highradix.h similarity index 100% rename from src/gpuwattch/cacti/highradix.h rename to src/accelwattch/cacti/highradix.h diff --git a/src/gpuwattch/cacti/htree2.cc b/src/accelwattch/cacti/htree2.cc similarity index 100% rename from 
src/gpuwattch/cacti/htree2.cc rename to src/accelwattch/cacti/htree2.cc diff --git a/src/gpuwattch/cacti/htree2.h b/src/accelwattch/cacti/htree2.h similarity index 100% rename from src/gpuwattch/cacti/htree2.h rename to src/accelwattch/cacti/htree2.h diff --git a/src/gpuwattch/cacti/io.cc b/src/accelwattch/cacti/io.cc similarity index 100% rename from src/gpuwattch/cacti/io.cc rename to src/accelwattch/cacti/io.cc diff --git a/src/gpuwattch/cacti/io.h b/src/accelwattch/cacti/io.h similarity index 100% rename from src/gpuwattch/cacti/io.h rename to src/accelwattch/cacti/io.h diff --git a/src/gpuwattch/cacti/main.cc b/src/accelwattch/cacti/main.cc similarity index 100% rename from src/gpuwattch/cacti/main.cc rename to src/accelwattch/cacti/main.cc diff --git a/src/gpuwattch/cacti/makefile b/src/accelwattch/cacti/makefile similarity index 100% rename from src/gpuwattch/cacti/makefile rename to src/accelwattch/cacti/makefile diff --git a/src/gpuwattch/cacti/mat.cc b/src/accelwattch/cacti/mat.cc similarity index 100% rename from src/gpuwattch/cacti/mat.cc rename to src/accelwattch/cacti/mat.cc diff --git a/src/gpuwattch/cacti/mat.h b/src/accelwattch/cacti/mat.h similarity index 100% rename from src/gpuwattch/cacti/mat.h rename to src/accelwattch/cacti/mat.h diff --git a/src/gpuwattch/cacti/nuca.cc b/src/accelwattch/cacti/nuca.cc similarity index 100% rename from src/gpuwattch/cacti/nuca.cc rename to src/accelwattch/cacti/nuca.cc diff --git a/src/gpuwattch/cacti/nuca.h b/src/accelwattch/cacti/nuca.h similarity index 100% rename from src/gpuwattch/cacti/nuca.h rename to src/accelwattch/cacti/nuca.h diff --git a/src/gpuwattch/cacti/out_batch_test_result.csv b/src/accelwattch/cacti/out_batch_test_result.csv similarity index 100% rename from src/gpuwattch/cacti/out_batch_test_result.csv rename to src/accelwattch/cacti/out_batch_test_result.csv diff --git a/src/gpuwattch/cacti/parameter.cc b/src/accelwattch/cacti/parameter.cc similarity index 100% rename from src/gpuwattch/cacti/parameter.cc rename to src/accelwattch/cacti/parameter.cc diff --git a/src/gpuwattch/cacti/parameter.h b/src/accelwattch/cacti/parameter.h similarity index 100% rename from src/gpuwattch/cacti/parameter.h rename to src/accelwattch/cacti/parameter.h diff --git a/src/gpuwattch/cacti/router.cc b/src/accelwattch/cacti/router.cc similarity index 100% rename from src/gpuwattch/cacti/router.cc rename to src/accelwattch/cacti/router.cc diff --git a/src/gpuwattch/cacti/router.h b/src/accelwattch/cacti/router.h similarity index 100% rename from src/gpuwattch/cacti/router.h rename to src/accelwattch/cacti/router.h diff --git a/src/gpuwattch/cacti/subarray.cc b/src/accelwattch/cacti/subarray.cc similarity index 100% rename from src/gpuwattch/cacti/subarray.cc rename to src/accelwattch/cacti/subarray.cc diff --git a/src/gpuwattch/cacti/subarray.h b/src/accelwattch/cacti/subarray.h similarity index 100% rename from src/gpuwattch/cacti/subarray.h rename to src/accelwattch/cacti/subarray.h diff --git a/src/gpuwattch/cacti/technology.cc b/src/accelwattch/cacti/technology.cc similarity index 100% rename from src/gpuwattch/cacti/technology.cc rename to src/accelwattch/cacti/technology.cc diff --git a/src/gpuwattch/cacti/uca.cc b/src/accelwattch/cacti/uca.cc similarity index 100% rename from src/gpuwattch/cacti/uca.cc rename to src/accelwattch/cacti/uca.cc diff --git a/src/gpuwattch/cacti/uca.h b/src/accelwattch/cacti/uca.h similarity index 100% rename from src/gpuwattch/cacti/uca.h rename to src/accelwattch/cacti/uca.h diff --git 
a/src/gpuwattch/cacti/wire.cc b/src/accelwattch/cacti/wire.cc similarity index 100% rename from src/gpuwattch/cacti/wire.cc rename to src/accelwattch/cacti/wire.cc diff --git a/src/gpuwattch/cacti/wire.h b/src/accelwattch/cacti/wire.h similarity index 100% rename from src/gpuwattch/cacti/wire.h rename to src/accelwattch/cacti/wire.h diff --git a/src/gpuwattch/core.cc b/src/accelwattch/core.cc similarity index 100% rename from src/gpuwattch/core.cc rename to src/accelwattch/core.cc diff --git a/src/gpuwattch/core.h b/src/accelwattch/core.h similarity index 100% rename from src/gpuwattch/core.h rename to src/accelwattch/core.h diff --git a/src/gpuwattch/fermi.xml b/src/accelwattch/fermi.xml similarity index 100% rename from src/gpuwattch/fermi.xml rename to src/accelwattch/fermi.xml diff --git a/src/gpuwattch/globalvar.h b/src/accelwattch/globalvar.h similarity index 100% rename from src/gpuwattch/globalvar.h rename to src/accelwattch/globalvar.h diff --git a/src/gpuwattch/gpgpu.xml b/src/accelwattch/gpgpu.xml similarity index 100% rename from src/gpuwattch/gpgpu.xml rename to src/accelwattch/gpgpu.xml diff --git a/src/gpuwattch/gpgpu_sim.verify b/src/accelwattch/gpgpu_sim.verify similarity index 100% rename from src/gpuwattch/gpgpu_sim.verify rename to src/accelwattch/gpgpu_sim.verify diff --git a/src/accelwattch/gpgpu_sim_wrapper.cc b/src/accelwattch/gpgpu_sim_wrapper.cc new file mode 100644 index 000000000..67d9daa1f --- /dev/null +++ b/src/accelwattch/gpgpu_sim_wrapper.cc @@ -0,0 +1,1143 @@ +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ +#include "gpgpu_sim_wrapper.h" +#include +#define SP_BASE_POWER 0 +#define SFU_BASE_POWER 0 + +static const char* pwr_cmp_label[] = { + "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", "RFP,", "INTP,", + "FPUP,", "DPUP,", "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", + "FP_MULP,", "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", + "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", "L2CP,", "MCP,", "NOCP,", + "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONSTP", "STATICP"}; + +enum pwr_cmp_t { + IBP=0, + ICP, + DCP, + TCP, + CCP, + SHRDP, + RFP, + INTP, + FPUP, + DPUP, + INT_MUL24P, + INT_MUL32P, + INT_MULP, + INT_DIVP, + FP_MULP, + FP_DIVP, + FP_SQRTP, + FP_LGP, + FP_SINP, + FP_EXP, + DP_MULP, + DP_DIVP, + TENSORP, + TEXP, + SCHEDP, + L2CP, + MCP, + NOCP, + DRAMP, + PIPEP, + IDLE_COREP, + CONSTP, + STATICP, + NUM_COMPONENTS_MODELLED +}; + +gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, + char* xmlfile, int power_simulation_mode, bool dvfs_enabled) { + kernel_sample_count = 0; + total_sample_count = 0; + + kernel_tot_power = 0; + avg_threads_per_warp_tot = 0; + num_pwr_cmps = NUM_COMPONENTS_MODELLED; + num_perf_counters = NUM_PERFORMANCE_COUNTERS; + + // Initialize per-component counter/power vectors + avg_max_min_counters init; + kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); + kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); + + kernel_power = init; // Per-kernel powers + gpu_tot_power = init; // Global powers + + sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); + + sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0); + initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + + const_dynamic_power = 0; + proc_power = 0; + + g_power_filename = NULL; + g_power_trace_filename = NULL; + g_metric_trace_filename = NULL; + g_steady_state_tracking_filename = NULL; + xml_filename = xmlfile; + g_power_simulation_enabled = power_simulation_enabled; + g_power_simulation_mode = power_simulation_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = false; + g_steady_power_levels_enabled = false; + g_power_trace_zlevel = 0; + g_power_per_cycle_dump = false; + gpu_steady_power_deviation = 0; + gpu_steady_min_period = 0; + + gpu_stat_sample_freq = 0; + p = new ParseXML(); + if (g_power_simulation_enabled) { + p->parse(xml_filename); + } + proc = new Processor(p); + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + has_written_avg = false; + init_inst_val = false; +} + +gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} + +bool gpgpu_sim_wrapper::sanity_check(double a, double b) { + if (b == 0) + return (abs(a - b) < 0.00001); + else + return (abs(a - b) / abs(b) < 0.00001); + + return false; +} +void gpgpu_sim_wrapper::init_mcpat_hw_mode(unsigned gpu_sim_cycle) { + p->sys.total_cycles = gpu_sim_cycle; //total simulated cycles for current kernel +} + +void gpgpu_sim_wrapper::init_mcpat( + char* xmlfile, char* powerfilename, char* power_trace_filename, + char* metric_trace_filename, char* steady_state_filename, + bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, + bool power_per_cycle_dump, double steady_power_deviation, + double steady_min_period, int zlevel, double init_val, + int stat_sample_freq, int power_sim_mode, bool dvfs_enabled, + unsigned clock_freq, unsigned num_shaders) { + // Write File Headers for (-metrics trace, -power trace) + + reset_counters(); + static bool mcpat_init = true; + + // initialize file 
name if it is not set + time_t curr_time; + time(&curr_time); + char* date = ctime(&curr_time); + char* s = date; + while (*s) { + if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; + if (*s == '\n' || *s == '\r') *s = 0; + s++; + } + + if (mcpat_init) { + g_power_filename = powerfilename; + g_power_trace_filename = power_trace_filename; + g_metric_trace_filename = metric_trace_filename; + g_steady_state_tracking_filename = steady_state_filename; + xml_filename = xmlfile; + g_power_simulation_enabled = power_sim_enabled; + g_power_simulation_mode = power_sim_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = trace_enabled; + g_steady_power_levels_enabled = steady_state_enabled; + g_power_trace_zlevel = zlevel; + g_power_per_cycle_dump = power_per_cycle_dump; + gpu_steady_power_deviation = steady_power_deviation; + gpu_steady_min_period = steady_min_period; + + gpu_stat_sample_freq = stat_sample_freq; + + // p->sys.total_cycles=gpu_stat_sample_freq*4; + p->sys.total_cycles = gpu_stat_sample_freq; + p->sys.target_core_clockrate = clock_freq; + p->sys.number_of_cores = num_shaders; + p->sys.core[0].clock_rate = clock_freq; + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + + if (g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "w"); + metric_trace_file = gzopen(g_metric_trace_filename, "w"); + if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + + gzprintf(power_trace_file, "power,"); + for (unsigned i = 0; i < num_pwr_cmps; i++) { + gzprintf(power_trace_file, pwr_cmp_label[i]); + } + gzprintf(power_trace_file, "\n"); + + gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(metric_trace_file, perf_count_label[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + if (g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); + if ((steady_state_tacking_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, + Z_DEFAULT_STRATEGY); + gzprintf(steady_state_tacking_file, "start,end,power,IPC,"); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(steady_state_tacking_file, perf_count_label[i]); + } + gzprintf(steady_state_tacking_file, "\n"); + + gzclose(steady_state_tacking_file); + } + + mcpat_init = false; + has_written_avg = false; + powerfile.open(g_power_filename); + int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + assert(flg == 0); + } + sample_val = 0; + init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; +} + +void gpgpu_sim_wrapper::reset_counters() { + avg_max_min_counters init; + for (unsigned i = 0; i < num_perf_counters; ++i) { + sample_perf_counters[i] = 0; + kernel_cmp_perf_counters[i] = init; + } + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + sample_cmp_pwr[i] = 0; + kernel_cmp_pwr[i] = init; + } + + // Reset per-kernel counters + kernel_sample_count = 0; + kernel_tot_power = 0; + kernel_power = init; + avg_threads_per_warp_tot = 0; + return; +} + +void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, + double busy_cycles, double tot_inst, + double int_inst, double fp_inst, + double 
load_inst, double store_inst, + double committed_inst) { + p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; + p->sys.core[0].total_cycles = tot_cycles; + p->sys.core[0].busy_cycles = busy_cycles; + p->sys.core[0].total_instructions = + tot_inst * p->sys.scaling_coefficients[TOT_INST]; + p->sys.core[0].int_instructions = + int_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].fp_instructions = + fp_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].load_instructions = load_inst; + p->sys.core[0].store_instructions = store_inst; + p->sys.core[0].committed_instructions = committed_inst; + sample_perf_counters[FP_INT] = int_inst + fp_inst; + sample_perf_counters[TOT_INST] = tot_inst; +} + +void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, + double ops) { + p->sys.core[0].int_regfile_reads = + reads * p->sys.scaling_coefficients[REG_RD]; + p->sys.core[0].int_regfile_writes = + writes * p->sys.scaling_coefficients[REG_WR]; + p->sys.core[0].non_rf_operands = + ops * p->sys.scaling_coefficients[NON_REG_OPs]; + sample_perf_counters[REG_RD] = reads; + sample_perf_counters[REG_WR] = writes; + sample_perf_counters[NON_REG_OPs] = ops; +} + +void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { + p->sys.core[0].icache.read_accesses = + hits * p->sys.scaling_coefficients[IC_H] + + misses * p->sys.scaling_coefficients[IC_M]; + p->sys.core[0].icache.read_misses = + misses * p->sys.scaling_coefficients[IC_M]; + sample_perf_counters[IC_H] = hits; + sample_perf_counters[IC_M] = misses; +} + +void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { + p->sys.core[0].ccache.read_accesses = + hits * p->sys.scaling_coefficients[CC_H] + + misses * p->sys.scaling_coefficients[CC_M]; + p->sys.core[0].ccache.read_misses = + misses * p->sys.scaling_coefficients[CC_M]; + sample_perf_counters[CC_H] = hits; + sample_perf_counters[CC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { + p->sys.core[0].tcache.read_accesses = + hits * p->sys.scaling_coefficients[TC_H] + + misses * p->sys.scaling_coefficients[TC_M]; + p->sys.core[0].tcache.read_misses = + misses * p->sys.scaling_coefficients[TC_M]; + sample_perf_counters[TC_H] = hits; + sample_perf_counters[TC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { + p->sys.core[0].sharedmemory.read_accesses = + accesses * p->sys.scaling_coefficients[SHRD_ACC]; + sample_perf_counters[SHRD_ACC] = accesses; +} + +void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, + double write_hits, + double write_misses) { + p->sys.core[0].dcache.read_accesses = + read_hits * p->sys.scaling_coefficients[DC_RH] + + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.read_misses = + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.write_accesses = + write_hits * p->sys.scaling_coefficients[DC_WH] + + write_misses * p->sys.scaling_coefficients[DC_WM]; + p->sys.core[0].dcache.write_misses = + write_misses * p->sys.scaling_coefficients[DC_WM]; + sample_perf_counters[DC_RH] = read_hits; + sample_perf_counters[DC_RM] = read_misses; + sample_perf_counters[DC_WH] = write_hits; + sample_perf_counters[DC_WM] = write_misses; + // TODO: coalescing logic 
is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, + double write_hits, + double write_misses) { + p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM] + + write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; + p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; + p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; + sample_perf_counters[L2_RH] = read_hits; + sample_perf_counters[L2_RM] = read_misses; + sample_perf_counters[L2_WH] = write_hits; + sample_perf_counters[L2_WM] = write_misses; +} + +void gpgpu_sim_wrapper::set_num_cores(double num_core) { + + num_cores = num_core; +} + +void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { + p->sys.num_idle_cores = num_idle_core; + sample_perf_counters[IDLE_CORE_N] = num_idle_core; + num_idle_cores = num_idle_core; +} + +void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { + p->sys.core[0].pipeline_duty_cycle = + duty_cycle * p->sys.scaling_coefficients[PIPE_A]; + sample_perf_counters[PIPE_A] = duty_cycle; +} + +void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, + double dram_precharge) { + p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + + writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; + p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; + sample_perf_counters[MEM_RD] = reads; + sample_perf_counters[MEM_WR] = writes; + sample_perf_counters[MEM_PRE] = dram_precharge; +} + + +void gpgpu_sim_wrapper::set_model_voltage(double model_voltage) { + modeled_chip_voltage = model_voltage; +} + + +void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, + double ialu_accesses, + double sfu_accesses) { + p->sys.core[0].fpu_accesses = fpu_accesses; + tot_fpu_accesses = fpu_accesses; + //Integer ALU (not present in Tesla) + p->sys.core[0].ialu_accesses = ialu_accesses; + + //Sfu accesses + p->sys.core[0].mul_accesses = sfu_accesses; + tot_sfu_accesses = sfu_accesses; +} + +PowerscalingCoefficients * gpgpu_sim_wrapper::get_scaling_coeffs() +{ + + PowerscalingCoefficients * scalingCoeffs = new PowerscalingCoefficients(); + + scalingCoeffs->int_coeff = p->sys.scaling_coefficients[INT_ACC]; + scalingCoeffs->int_mul_coeff = p->sys.scaling_coefficients[INT_MUL_ACC]; + scalingCoeffs->int_mul24_coeff = p->sys.scaling_coefficients[INT_MUL24_ACC]; + scalingCoeffs->int_mul32_coeff = p->sys.scaling_coefficients[INT_MUL32_ACC]; + scalingCoeffs->int_div_coeff = p->sys.scaling_coefficients[INT_DIV_ACC]; + scalingCoeffs->fp_coeff = p->sys.scaling_coefficients[FP_ACC]; + scalingCoeffs->dp_coeff = p->sys.scaling_coefficients[DP_ACC]; + scalingCoeffs->fp_mul_coeff = p->sys.scaling_coefficients[FP_MUL_ACC]; + scalingCoeffs->fp_div_coeff = 
p->sys.scaling_coefficients[FP_DIV_ACC]; + scalingCoeffs->dp_mul_coeff = p->sys.scaling_coefficients[DP_MUL_ACC]; + scalingCoeffs->dp_div_coeff = p->sys.scaling_coefficients[DP_DIV_ACC]; + scalingCoeffs->sqrt_coeff = p->sys.scaling_coefficients[FP_SQRT_ACC]; + scalingCoeffs->log_coeff = p->sys.scaling_coefficients[FP_LG_ACC]; + scalingCoeffs->sin_coeff = p->sys.scaling_coefficients[FP_SIN_ACC]; + scalingCoeffs->exp_coeff = p->sys.scaling_coefficients[FP_EXP_ACC]; + scalingCoeffs->tensor_coeff = p->sys.scaling_coefficients[TENSOR_ACC]; + scalingCoeffs->tex_coeff = p->sys.scaling_coefficients[TEX_ACC]; + return scalingCoeffs; + +} + +void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, + double imul24_accesses, + double imul32_accesses, + double imul_accesses, + double idiv_accesses) +{ + + sample_perf_counters[INT_ACC]=ialu_accesses; + sample_perf_counters[INT_MUL24_ACC]=imul24_accesses; + sample_perf_counters[INT_MUL32_ACC]=imul32_accesses; + sample_perf_counters[INT_MUL_ACC]=imul_accesses; + sample_perf_counters[INT_DIV_ACC]=idiv_accesses; +} + +void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, + double dpmul_accesses, + double dpdiv_accesses) +{ + sample_perf_counters[DP_ACC]=dpu_accesses; + sample_perf_counters[DP_MUL_ACC]=dpmul_accesses; + sample_perf_counters[DP_DIV_ACC]=dpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_fp_accesses(double fpu_accesses, + double fpmul_accesses, + double fpdiv_accesses) +{ + sample_perf_counters[FP_ACC]=fpu_accesses; + sample_perf_counters[FP_MUL_ACC]=fpmul_accesses; + sample_perf_counters[FP_DIV_ACC]=fpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, + double log_accesses, + double sin_accesses, + double exp_accesses) +{ + + sample_perf_counters[FP_SQRT_ACC]=sqrt_accesses; + sample_perf_counters[FP_LG_ACC]=log_accesses; + sample_perf_counters[FP_SIN_ACC]=sin_accesses; + sample_perf_counters[FP_EXP_ACC]=exp_accesses; + +} + +void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) +{ + sample_perf_counters[TENSOR_ACC]=tensor_accesses; + +} + +void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) +{ + sample_perf_counters[TEX_ACC]=tex_accesses; + +} + +void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) +{ + avg_threads_per_warp = (unsigned)ceil(active_threads); + avg_threads_per_warp_tot += active_threads; +} + +void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, + double sfu_avg_active_lane) { + p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; + p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; +} + +void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_acc) { + p->sys.NoC[0].total_accesses = + noc_tot_acc * p->sys.scaling_coefficients[NOC_A]; + sample_perf_counters[NOC_A] = noc_tot_acc; +} + +void gpgpu_sim_wrapper::power_metrics_calculations() { + total_sample_count++; + kernel_sample_count++; + + // Current sample power + double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + sample_cmp_pwr[STATICP]; + // double sample_power; + // for(unsigned i=0; i kernel_power.max) { + kernel_power.max = sample_power; + for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; + } + } + + // Min Power + if (sample_power < kernel_power.min || (kernel_power.min == 0)) { + kernel_power.min = sample_power; + for (unsigned ind = 0; 
ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; + } + } + + gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); + gpu_tot_power.max = + (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; + gpu_tot_power.min = + ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) + ? sample_power + : gpu_tot_power.min; +} + +void gpgpu_sim_wrapper::print_trace_files() { + open_files(); + + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzprintf(power_trace_file, "%f,", proc_power); + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); + } + gzprintf(power_trace_file, "\n"); + + close_files(); +} + +void gpgpu_sim_wrapper::update_coefficients() +{ + + initpower_coeff[FP_INT]=proc->cores[0]->get_coefficient_fpint_insts(); + effpower_coeff[FP_INT]=initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; + + initpower_coeff[TOT_INST]=proc->cores[0]->get_coefficient_tot_insts(); + effpower_coeff[TOT_INST]=initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; + + initpower_coeff[REG_RD]=proc->cores[0]->get_coefficient_regreads_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[REG_WR]=proc->cores[0]->get_coefficient_regwrites_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[NON_REG_OPs]=proc->cores[0]->get_coefficient_noregfileops_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + effpower_coeff[REG_RD]=initpower_coeff[REG_RD]*p->sys.scaling_coefficients[REG_RD]; + effpower_coeff[REG_WR]=initpower_coeff[REG_WR]*p->sys.scaling_coefficients[REG_WR]; + effpower_coeff[NON_REG_OPs]=initpower_coeff[NON_REG_OPs]*p->sys.scaling_coefficients[NON_REG_OPs]; + + initpower_coeff[IC_H]=proc->cores[0]->get_coefficient_icache_hits(); + initpower_coeff[IC_M]=proc->cores[0]->get_coefficient_icache_misses(); + effpower_coeff[IC_H]=initpower_coeff[IC_H]*p->sys.scaling_coefficients[IC_H]; + effpower_coeff[IC_M]=initpower_coeff[IC_M]*p->sys.scaling_coefficients[IC_M]; + + initpower_coeff[CC_H]=(proc->cores[0]->get_coefficient_ccache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[CC_M]=(proc->cores[0]->get_coefficient_ccache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[CC_H]=initpower_coeff[CC_H]*p->sys.scaling_coefficients[CC_H]; + effpower_coeff[CC_M]=initpower_coeff[CC_M]*p->sys.scaling_coefficients[CC_M]; + + initpower_coeff[TC_H]=(proc->cores[0]->get_coefficient_tcache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[TC_M]=(proc->cores[0]->get_coefficient_tcache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[TC_H]=initpower_coeff[TC_H]*p->sys.scaling_coefficients[TC_H]; + effpower_coeff[TC_M]=initpower_coeff[TC_M]*p->sys.scaling_coefficients[TC_M]; + + initpower_coeff[SHRD_ACC]=proc->cores[0]->get_coefficient_sharedmemory_readhits(); + effpower_coeff[SHRD_ACC]=initpower_coeff[SHRD_ACC]*p->sys.scaling_coefficients[SHRD_ACC]; + + initpower_coeff[DC_RH]=(proc->cores[0]->get_coefficient_dcache_readhits() + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_RM]=(proc->cores[0]->get_coefficient_dcache_readmisses() + 
proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_WH]=(proc->cores[0]->get_coefficient_dcache_writehits() + proc->get_coefficient_writecoalescing()); + initpower_coeff[DC_WM]=(proc->cores[0]->get_coefficient_dcache_writemisses() + proc->get_coefficient_writecoalescing()); + effpower_coeff[DC_RH]=initpower_coeff[DC_RH]*p->sys.scaling_coefficients[DC_RH]; + effpower_coeff[DC_RM]=initpower_coeff[DC_RM]*p->sys.scaling_coefficients[DC_RM]; + effpower_coeff[DC_WH]=initpower_coeff[DC_WH]*p->sys.scaling_coefficients[DC_WH]; + effpower_coeff[DC_WM]=initpower_coeff[DC_WM]*p->sys.scaling_coefficients[DC_WM]; + + initpower_coeff[L2_RH]=proc->get_coefficient_l2_read_hits(); + initpower_coeff[L2_RM]=proc->get_coefficient_l2_read_misses(); + initpower_coeff[L2_WH]=proc->get_coefficient_l2_write_hits(); + initpower_coeff[L2_WM]=proc->get_coefficient_l2_write_misses(); + effpower_coeff[L2_RH]=initpower_coeff[L2_RH]*p->sys.scaling_coefficients[L2_RH]; + effpower_coeff[L2_RM]=initpower_coeff[L2_RM]*p->sys.scaling_coefficients[L2_RM]; + effpower_coeff[L2_WH]=initpower_coeff[L2_WH]*p->sys.scaling_coefficients[L2_WH]; + effpower_coeff[L2_WM]=initpower_coeff[L2_WM]*p->sys.scaling_coefficients[L2_WM]; + + initpower_coeff[IDLE_CORE_N]=p->sys.idle_core_power * proc->cores[0]->executionTime; + effpower_coeff[IDLE_CORE_N]=initpower_coeff[IDLE_CORE_N]*p->sys.scaling_coefficients[IDLE_CORE_N]; + + initpower_coeff[PIPE_A]=proc->cores[0]->get_coefficient_duty_cycle(); + effpower_coeff[PIPE_A]=initpower_coeff[PIPE_A]*p->sys.scaling_coefficients[PIPE_A]; + + initpower_coeff[MEM_RD]=proc->get_coefficient_mem_reads(); + initpower_coeff[MEM_WR]=proc->get_coefficient_mem_writes(); + initpower_coeff[MEM_PRE]=proc->get_coefficient_mem_pre(); + effpower_coeff[MEM_RD]=initpower_coeff[MEM_RD]*p->sys.scaling_coefficients[MEM_RD]; + effpower_coeff[MEM_WR]=initpower_coeff[MEM_WR]*p->sys.scaling_coefficients[MEM_WR]; + effpower_coeff[MEM_PRE]=initpower_coeff[MEM_PRE]*p->sys.scaling_coefficients[MEM_PRE]; + + double fp_coeff = proc->cores[0]->get_coefficient_fpu_accesses(); + double sfu_coeff = proc->cores[0]->get_coefficient_sfu_accesses(); + + initpower_coeff[INT_ACC]= proc->cores[0]->get_coefficient_ialu_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + if(tot_fpu_accesses != 0){ + initpower_coeff[FP_ACC]= fp_coeff * sample_perf_counters[FP_ACC]/tot_fpu_accesses; + initpower_coeff[DP_ACC]= fp_coeff * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + initpower_coeff[FP_ACC]= 0; + initpower_coeff[DP_ACC]= 0; + } + + if(tot_sfu_accesses != 0){ + initpower_coeff[INT_MUL24_ACC]= sfu_coeff * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL32_ACC]= sfu_coeff * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL_ACC]= sfu_coeff * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[INT_DIV_ACC]= sfu_coeff * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[DP_MUL_ACC]= sfu_coeff * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[DP_DIV_ACC]= sfu_coeff * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_MUL_ACC]= sfu_coeff * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[FP_DIV_ACC]= sfu_coeff * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SQRT_ACC]= sfu_coeff * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + initpower_coeff[FP_LG_ACC]= sfu_coeff * 
sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SIN_ACC]= sfu_coeff * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + initpower_coeff[FP_EXP_ACC]= sfu_coeff * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + initpower_coeff[TENSOR_ACC]= sfu_coeff * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + initpower_coeff[TEX_ACC]= sfu_coeff * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + initpower_coeff[INT_MUL24_ACC]= 0; + initpower_coeff[INT_MUL32_ACC]= 0; + initpower_coeff[INT_MUL_ACC]= 0; + initpower_coeff[INT_DIV_ACC]= 0; + initpower_coeff[DP_MUL_ACC]= 0; + initpower_coeff[DP_DIV_ACC]= 0; + initpower_coeff[FP_MUL_ACC]= 0; + initpower_coeff[FP_DIV_ACC]= 0; + initpower_coeff[FP_SQRT_ACC]= 0; + initpower_coeff[FP_LG_ACC]= 0; + initpower_coeff[FP_SIN_ACC]= 0; + initpower_coeff[FP_EXP_ACC]= 0; + initpower_coeff[TENSOR_ACC]= 0; + initpower_coeff[TEX_ACC]= 0; + } + + effpower_coeff[INT_ACC]= initpower_coeff[INT_ACC]; + effpower_coeff[FP_ACC]= initpower_coeff[FP_ACC]; + effpower_coeff[DP_ACC]= initpower_coeff[DP_ACC]; + effpower_coeff[INT_MUL24_ACC]= initpower_coeff[INT_MUL24_ACC]; + effpower_coeff[INT_MUL32_ACC]= initpower_coeff[INT_MUL32_ACC]; + effpower_coeff[INT_MUL_ACC]= initpower_coeff[INT_MUL_ACC]; + effpower_coeff[INT_DIV_ACC]= initpower_coeff[INT_DIV_ACC]; + effpower_coeff[DP_MUL_ACC]= initpower_coeff[DP_MUL_ACC]; + effpower_coeff[DP_DIV_ACC]= initpower_coeff[DP_DIV_ACC]; + effpower_coeff[FP_MUL_ACC]= initpower_coeff[FP_MUL_ACC]; + effpower_coeff[FP_DIV_ACC]= initpower_coeff[FP_DIV_ACC]; + effpower_coeff[FP_SQRT_ACC]= initpower_coeff[FP_SQRT_ACC]; + effpower_coeff[FP_LG_ACC]= initpower_coeff[FP_LG_ACC]; + effpower_coeff[FP_SIN_ACC]= initpower_coeff[FP_SIN_ACC]; + effpower_coeff[FP_EXP_ACC]= initpower_coeff[FP_EXP_ACC]; + effpower_coeff[TENSOR_ACC]= initpower_coeff[TENSOR_ACC]; + effpower_coeff[TEX_ACC]= initpower_coeff[TEX_ACC]; + + initpower_coeff[NOC_A]=proc->get_coefficient_noc_accesses(); + effpower_coeff[NOC_A]=initpower_coeff[NOC_A]*p->sys.scaling_coefficients[NOC_A]; + + //const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + + for(unsigned i=0; icores[0]->executionTime); + effpower_coeff[i]/=(proc->cores[0]->executionTime); + } +} + +double gpgpu_sim_wrapper::calculate_static_power(){ + double int_accesses = initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double int_add_accesses = initpower_coeff[INT_ACC]; + double int_mul_accesses = initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + initpower_coeff[FP_DIV_ACC]; + double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + initpower_coeff[DP_DIV_ACC]; + double sfu_accesses = initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; + double tensor_accesses = initpower_coeff[TENSOR_ACC]; + double tex_accesses = initpower_coeff[TEX_ACC]; + double total_static_power = 0.0; + double base_static_power = 0.0; + double lane_static_power = 0.0; + double per_active_core = (num_cores - num_idle_cores)/num_cores; + + + double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; + double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + 
initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; + double shared_accesses = initpower_coeff[SHRD_ACC]; + + + if(avg_threads_per_warp == 0){ //no functional unit threads, check for memory or a 'LIGHT_SM' + if(l1_accesses != 0.0) + return (p->sys.static_l1_flane*per_active_core); + else if(shared_accesses != 0.0) + return (p->sys.static_shared_flane*per_active_core); + else if(l2_accesses != 0.0) + return (p->sys.static_l2_flane*per_active_core); + else //LIGHT_SM + return (p->sys.static_light_flane*per_active_core); //return LIGHT_SM base static power + } + + /* using a linear model for thread divergence */ + if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses != 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_DP */ + base_static_power = p->sys.static_cat3_flane; + lane_static_power = p->sys.static_cat3_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses != 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_TENSOR */ + base_static_power = p->sys.static_cat6_flane; + lane_static_power = p->sys.static_cat6_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses != 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_SFU */ + base_static_power = p->sys.static_cat4_flane; + lane_static_power = p->sys.static_cat4_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses != 0.0)){ + /* INT_FP_TEX */ + base_static_power = p->sys.static_cat5_flane; + lane_static_power = p->sys.static_cat5_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP */ + base_static_power = p->sys.static_cat2_flane; + lane_static_power = p->sys.static_cat2_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT */ + /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ + if((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)){ //INT_ADD + base_static_power = p->sys.static_intadd_flane; + lane_static_power = p->sys.static_intadd_addlane; + } + else if((int_add_accesses == 0.0) && (int_mul_accesses != 0.0)){ //INT_MUL + base_static_power = p->sys.static_intmul_flane; + lane_static_power = p->sys.static_intmul_addlane; + } + else{ //INT_ADD+MUL + base_static_power = p->sys.static_cat1_flane; + lane_static_power = p->sys.static_cat1_addlane; + } + } + + else if((int_accesses == 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* LIGHT_SM or memory only sample */ + lane_static_power = 0.0; //addlane static power is 0 for l1/l2/shared memory only accesses + if(l1_accesses != 0.0) + base_static_power = p->sys.static_l1_flane; + else if(shared_accesses != 0.0) + base_static_power = p->sys.static_shared_flane; + else if(l2_accesses != 0.0) + base_static_power = p->sys.static_l2_flane; + else{ + base_static_power = p->sys.static_light_flane; + lane_static_power = p->sys.static_light_addlane; + } + } + else{ + base_static_power = p->sys.static_geomean_flane; //GEOMEAN except LIGHT_SM if we don't fall into any of the categories above + 
lane_static_power = p->sys.static_geomean_addlane; + } + + total_static_power = base_static_power + (((double)avg_threads_per_warp-1.0)*lane_static_power); //Linear Model + return (total_static_power*per_active_core); +} + +void gpgpu_sim_wrapper::update_components_power() +{ + + update_coefficients(); + + proc_power=proc->rt_power.readOp.dynamic; + sample_cmp_pwr[IBP]=(proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + +proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + +proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[ICP]=proc->cores[0]->ifu->icache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DCP]=proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[TCP]=proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[CCP]=proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[SHRDP]=proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[RFP]=(proc->cores[0]->exu->rfu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + double sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + sample_cmp_pwr[INTP]=(proc->cores[0]->exu->exeu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + + if(tot_fpu_accesses != 0){ + sample_cmp_pwr[FPUP]= sample_fp_pwr * sample_perf_counters[FP_ACC]/tot_fpu_accesses; + sample_cmp_pwr[DPUP]= sample_fp_pwr * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + sample_cmp_pwr[FPUP]= 0; + sample_cmp_pwr[DPUP]= 0; + } + if(tot_sfu_accesses != 0){ + sample_cmp_pwr[INT_MUL24P]= sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MUL32P]= sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MULP]= sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_DIVP]= sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_MULP]= sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_DIVP]= sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SQRTP]= sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_LGP]= sample_sfu_pwr * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SINP]= sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_EXP]= sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_MULP]= sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_DIVP]= sample_sfu_pwr * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TENSORP]= sample_sfu_pwr * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TEXP]= sample_sfu_pwr * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + sample_cmp_pwr[INT_MUL24P]= 0; + sample_cmp_pwr[INT_MUL32P]= 0; + 
sample_cmp_pwr[INT_MULP]= 0; + sample_cmp_pwr[INT_DIVP]= 0; + sample_cmp_pwr[FP_MULP]= 0; + sample_cmp_pwr[FP_DIVP]= 0; + sample_cmp_pwr[FP_SQRTP]= 0; + sample_cmp_pwr[FP_LGP]= 0; + sample_cmp_pwr[FP_SINP]= 0; + sample_cmp_pwr[FP_EXP]= 0; + sample_cmp_pwr[DP_MULP]= 0; + sample_cmp_pwr[DP_DIVP]= 0; + sample_cmp_pwr[TENSORP]= 0; + sample_cmp_pwr[TEXP]= 0; + } + + sample_cmp_pwr[SCHEDP]=proc->cores[0]->exu->scheu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[L2CP]=(proc->XML->sys.number_of_L2s>0)? proc->l2array[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime):0; + + sample_cmp_pwr[MCP]=(proc->mc->rt_power.readOp.dynamic-proc->mc->dram->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[NOCP]=proc->nocs[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DRAMP]=proc->mc->dram->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[PIPEP]=proc->cores[0]->Pipeline_energy/(proc->cores[0]->executionTime); + + sample_cmp_pwr[IDLE_COREP]=proc->cores[0]->IdleCoreEnergy/(proc->cores[0]->executionTime); + + // This constant dynamic power (e.g., clock power) part is estimated via regression model. + sample_cmp_pwr[CONSTP]=0; + sample_cmp_pwr[STATICP]=0; + // double cnst_dyn = proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + // // If the regression scaling term is greater than the recorded constant dynamic power + // // then use the difference (other portion already added to dynamic power). Else, + // // all the constant dynamic power is accounted for, add nothing. + // if(p->sys.scaling_coefficients[constant_power] > cnst_dyn) + // sample_cmp_pwr[CONSTP] = (p->sys.scaling_coefficients[constant_power]-cnst_dyn); + sample_cmp_pwr[CONSTP] = p->sys.scaling_coefficients[constant_power]; + sample_cmp_pwr[STATICP] = calculate_static_power(); + + if(g_dvfs_enabled){ + double voltage_ratio = modeled_chip_voltage/p->sys.modeled_chip_voltage_ref; + sample_cmp_pwr[IDLE_COREP] *= voltage_ratio; // static power scaled by voltage_ratio + sample_cmp_pwr[STATICP] *= voltage_ratio; // static power scaled by voltage_ratio + for(unsigned i=0; icompute(); } +void gpgpu_sim_wrapper::print_power_kernel_stats( + double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, + const std::string& kernel_info_string, bool print_trace) { + detect_print_steady_state(1, init_value); + if (g_power_simulation_enabled) { + powerfile << kernel_info_string << std::endl; + + sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); + powerfile << "Kernel Average Power Data:" << std::endl; + powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; + + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_avg_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg / kernel_sample_count + << std::endl; + } + + powerfile << "gpu_avg_threads_per_warp = " + << avg_threads_per_warp_tot / (double)kernel_sample_count + << std::endl; + + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_tot_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg + << std::endl; + } + + powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; + powerfile << "kernel_max_power = " << kernel_power.max << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) 
{ + powerfile << "gpu_max_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].max << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_max_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].max << std::endl; + } + + powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; + powerfile << "kernel_min_power = " << kernel_power.min << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].min << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_min_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].min << std::endl; + } + + powerfile << std::endl + << "Accumulative Power Statistics Over Previous Kernels:" + << std::endl; + powerfile << "gpu_tot_avg_power = " + << gpu_tot_power.avg / total_sample_count << std::endl; + powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; + powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; + powerfile << std::endl << std::endl; + powerfile.flush(); + + if (print_trace) { + print_trace_files(); + } + } +} +void gpgpu_sim_wrapper::dump() { + if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); +} + +void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { + double temp_avg = sample_val / (double)samples.size(); + double temp_ipc = (init_val - init_inst_val) / + (double)(samples.size() * gpu_stat_sample_freq); + + if ((samples.size() > + gpu_steady_min_period)) { // If steady state occurred for some time, + // print to file + has_written_avg = true; + gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, + total_sample_count, temp_avg, temp_ipc); + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(steady_state_tacking_file, "%f,", + samples_counter.at(i) / ((double)samples.size())); + } + gzprintf(steady_state_tacking_file, "\n"); + } else { + if (!has_written_avg && position) + gzprintf(steady_state_tacking_file, + "ERROR! 
Not enough steady state points to generate average\n"); + } + + sample_start = 0; + sample_val = 0; + init_inst_val = init_val; + samples.clear(); + samples_counter.clear(); + pwr_counter.clear(); + assert(samples.size() == 0); +} + +void gpgpu_sim_wrapper::detect_print_steady_state(int position, + double init_val) { + // Calculating Average + if (g_power_simulation_enabled && g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); + if (position == 0) { + if (samples.size() == 0) { + // First sample + sample_start = total_sample_count; + sample_val = proc->rt_power.readOp.dynamic; + init_inst_val = init_val; + samples.push_back(proc->rt_power.readOp.dynamic); + assert(samples_counter.size() == 0); + assert(pwr_counter.size() == 0); + + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.push_back(sample_perf_counters[i]); + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.push_back(sample_cmp_pwr[i]); + } + assert(pwr_counter.size() == (double)num_pwr_cmps); + assert(samples_counter.size() == (double)num_perf_counters); + } else { + // Get current average + double temp_avg = sample_val / (double)samples.size(); + + if (abs(proc->rt_power.readOp.dynamic - temp_avg) < + gpu_steady_power_deviation) { // Value is within threshold + sample_val += proc->rt_power.readOp.dynamic; + samples.push_back(proc->rt_power.readOp.dynamic); + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.at(i) += sample_perf_counters[i]; + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.at(i) += sample_cmp_pwr[i]; + } + + } else { // Value exceeds threshold, not considered steady state + print_steady_state(position, init_val); + } + } + } else { + print_steady_state(position, init_val); + } + gzclose(steady_state_tacking_file); + } +} + +void gpgpu_sim_wrapper::open_files() { + if (g_power_simulation_enabled) { + if (g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "a"); + metric_trace_file = gzopen(g_metric_trace_filename, "a"); + } + } +} +void gpgpu_sim_wrapper::close_files() { + if (g_power_simulation_enabled) { + if (g_power_trace_enabled) { + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + } +} diff --git a/src/gpuwattch/gpgpu_sim_wrapper.h b/src/accelwattch/gpgpu_sim_wrapper.h similarity index 68% rename from src/gpuwattch/gpgpu_sim_wrapper.h rename to src/accelwattch/gpgpu_sim_wrapper.h index 00e4f0746..33c4b72f2 100644 --- a/src/gpuwattch/gpgpu_sim_wrapper.h +++ b/src/accelwattch/gpgpu_sim_wrapper.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,9 +55,34 @@ struct avg_max_min_counters { } }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; + +#endif + class gpgpu_sim_wrapper { public: - gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile); + gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile, int power_simulation_mode, bool dvfs_enabled); ~gpgpu_sim_wrapper(); void init_mcpat(char* xmlfile, char* powerfile, char* power_trace_file, @@ -64,7 +90,9 @@ class gpgpu_sim_wrapper { bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double steady_power_deviation, double steady_min_period, - int zlevel, double init_val, int stat_sample_freq); + int zlevel, double init_val, int stat_sample_freq, int power_sim_mode, + bool dvfs_enabled, unsigned clock_freq, unsigned num_shaders); + void init_mcpat_hw_mode(unsigned gpu_sim_cycle); void detect_print_steady_state(int position, double init_val); void close_files(); void open_files(); @@ -72,6 +100,7 @@ class gpgpu_sim_wrapper { void dump(); void print_trace_files(); void update_components_power(); + double calculate_static_power(); void update_coefficients(); void reset_counters(); void print_power_kernel_stats(double gpu_sim_cycle, double gpu_tot_sim_cycle, @@ -79,6 +108,7 @@ class gpgpu_sim_wrapper { const std::string& kernel_info_string, bool print_trace); void power_metrics_calculations(); + void set_model_voltage(double model_voltage); void set_inst_power(bool clk_gated_lanes, double tot_cycles, double busy_cycles, double tot_inst, double int_inst, double fp_inst, double load_inst, double store_inst, @@ -92,16 +122,31 @@ class gpgpu_sim_wrapper { double write_accesses, double write_misses); void set_l2cache_power(double read_accesses, double read_misses, double write_accesses, double write_misses); + void set_num_cores(double num_core); void set_idle_core_power(double num_idle_core); void set_duty_cycle_power(double duty_cycle); void set_mem_ctrl_power(double reads, double writes, double dram_precharge); void set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses); + void set_int_accesses(double ialu_accesses, double 
imul24_accesses, + double imul32_accesses, double imul_accesses, + double idiv_accesses); + void set_dp_accesses(double dpu_accesses, double dpmul_accesses, + double dpdiv_accesses); + void set_fp_accesses(double fpu_accesses, double fpmul_accesses, + double fpdiv_accesses); + void set_trans_accesses(double sqrt_accesses, double log_accesses, + double sin_accesses, double exp_accesses); + void set_tensor_accesses(double tensor_accesses); + void set_tex_accesses(double tex_accesses); + void set_avg_active_threads(float active_threads); void set_active_lanes_power(double sp_avg_active_lane, double sfu_avg_active_lane); - void set_NoC_power(double noc_tot_reads, double noc_tot_write); + void set_NoC_power(double noc_tot_acc); bool sanity_check(double a, double b); + PowerscalingCoefficients * get_scaling_coeffs(); + private: void print_steady_state(int position, double init_val); @@ -109,8 +154,10 @@ class gpgpu_sim_wrapper { ParseXML* p; // power parameters double const_dynamic_power; + double avg_threads_per_warp_tot; double proc_power; - + double num_cores; + double num_idle_cores; unsigned num_perf_counters; // # of performance counters unsigned num_pwr_cmps; // # of components modelled int kernel_sample_count; // # of samples per kernel @@ -140,6 +187,10 @@ class gpgpu_sim_wrapper { unsigned sample_start; double sample_val; double init_inst_val; + double tot_sfu_accesses; + double tot_fpu_accesses; + double modeled_chip_voltage; + unsigned avg_threads_per_warp; std::vector samples; std::vector samples_counter; std::vector pwr_counter; @@ -150,6 +201,8 @@ class gpgpu_sim_wrapper { char* g_metric_trace_filename; char* g_steady_state_tracking_filename; bool g_power_simulation_enabled; + int g_power_simulation_mode; + bool g_dvfs_enabled; bool g_steady_power_levels_enabled; bool g_power_trace_enabled; bool g_power_per_cycle_dump; diff --git a/src/gpuwattch/gpgpu_static.xml b/src/accelwattch/gpgpu_static.xml similarity index 100% rename from src/gpuwattch/gpgpu_static.xml rename to src/accelwattch/gpgpu_static.xml diff --git a/src/gpuwattch/interconnect.cc b/src/accelwattch/interconnect.cc similarity index 100% rename from src/gpuwattch/interconnect.cc rename to src/accelwattch/interconnect.cc diff --git a/src/gpuwattch/interconnect.h b/src/accelwattch/interconnect.h similarity index 100% rename from src/gpuwattch/interconnect.h rename to src/accelwattch/interconnect.h diff --git a/src/gpuwattch/iocontrollers.cc b/src/accelwattch/iocontrollers.cc similarity index 100% rename from src/gpuwattch/iocontrollers.cc rename to src/accelwattch/iocontrollers.cc diff --git a/src/gpuwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h similarity index 100% rename from src/gpuwattch/iocontrollers.h rename to src/accelwattch/iocontrollers.h diff --git a/src/gpuwattch/logic.cc b/src/accelwattch/logic.cc similarity index 100% rename from src/gpuwattch/logic.cc rename to src/accelwattch/logic.cc diff --git a/src/gpuwattch/logic.h b/src/accelwattch/logic.h similarity index 100% rename from src/gpuwattch/logic.h rename to src/accelwattch/logic.h diff --git a/src/gpuwattch/main.cc b/src/accelwattch/main.cc similarity index 100% rename from src/gpuwattch/main.cc rename to src/accelwattch/main.cc diff --git a/src/gpuwattch/makefile b/src/accelwattch/makefile similarity index 100% rename from src/gpuwattch/makefile rename to src/accelwattch/makefile diff --git a/src/gpuwattch/mcpat.mk b/src/accelwattch/mcpat.mk similarity index 97% rename from src/gpuwattch/mcpat.mk rename to src/accelwattch/mcpat.mk index 
a09c23b4c..ad2d6c299 100644 --- a/src/gpuwattch/mcpat.mk +++ b/src/accelwattch/mcpat.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch TARGET = mcpat SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/mcpatXeonCore.mk b/src/accelwattch/mcpatXeonCore.mk similarity index 100% rename from src/gpuwattch/mcpatXeonCore.mk rename to src/accelwattch/mcpatXeonCore.mk diff --git a/src/gpuwattch/memoryctrl.cc b/src/accelwattch/memoryctrl.cc similarity index 100% rename from src/gpuwattch/memoryctrl.cc rename to src/accelwattch/memoryctrl.cc diff --git a/src/gpuwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h similarity index 100% rename from src/gpuwattch/memoryctrl.h rename to src/accelwattch/memoryctrl.h diff --git a/src/gpuwattch/noc.cc b/src/accelwattch/noc.cc similarity index 100% rename from src/gpuwattch/noc.cc rename to src/accelwattch/noc.cc diff --git a/src/gpuwattch/noc.h b/src/accelwattch/noc.h similarity index 100% rename from src/gpuwattch/noc.h rename to src/accelwattch/noc.h diff --git a/src/gpuwattch/processor.cc b/src/accelwattch/processor.cc similarity index 99% rename from src/gpuwattch/processor.cc rename to src/accelwattch/processor.cc index fc6db463d..9e7f5b2c5 100644 --- a/src/gpuwattch/processor.cc +++ b/src/accelwattch/processor.cc @@ -30,11 +30,13 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "processor.h" #include #include @@ -118,7 +120,7 @@ Processor::Processor(ParseXML *XML_interface) set_pppm(pppm_t, cores[i]->clockRate * procdynp.numCore, procdynp.numCore, procdynp.numCore, procdynp.numCore); // set the exClockRate - exClockRate = cores[0]->clockRate * 2; // TODO; get from XML file + exClockRate = cores[0]->clockRate; // TODO; get from XML file // cout<<"****EX clock rate:"<power * pppm_t; set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore, diff --git a/src/gpuwattch/processor.h b/src/accelwattch/processor.h similarity index 100% rename from src/gpuwattch/processor.h rename to src/accelwattch/processor.h diff --git a/src/gpuwattch/quadro.xml b/src/accelwattch/quadro.xml similarity index 100% rename from src/gpuwattch/quadro.xml rename to src/accelwattch/quadro.xml diff --git a/src/gpuwattch/results/Alpha21364 b/src/accelwattch/results/Alpha21364 similarity index 100% rename from src/gpuwattch/results/Alpha21364 rename to src/accelwattch/results/Alpha21364 diff --git a/src/gpuwattch/results/Alpha21364_90nm b/src/accelwattch/results/Alpha21364_90nm similarity index 100% rename from src/gpuwattch/results/Alpha21364_90nm rename to src/accelwattch/results/Alpha21364_90nm diff --git a/src/gpuwattch/results/Penryn b/src/accelwattch/results/Penryn similarity index 100% rename from src/gpuwattch/results/Penryn rename to src/accelwattch/results/Penryn diff --git a/src/gpuwattch/results/T1 b/src/accelwattch/results/T1 similarity index 100% rename 
from src/gpuwattch/results/T1 rename to src/accelwattch/results/T1 diff --git a/src/gpuwattch/results/T1_DC_64 b/src/accelwattch/results/T1_DC_64 similarity index 100% rename from src/gpuwattch/results/T1_DC_64 rename to src/accelwattch/results/T1_DC_64 diff --git a/src/gpuwattch/results/T1_SBT_64 b/src/accelwattch/results/T1_SBT_64 similarity index 100% rename from src/gpuwattch/results/T1_SBT_64 rename to src/accelwattch/results/T1_SBT_64 diff --git a/src/gpuwattch/results/T1_ST_64 b/src/accelwattch/results/T1_ST_64 similarity index 100% rename from src/gpuwattch/results/T1_ST_64 rename to src/accelwattch/results/T1_ST_64 diff --git a/src/gpuwattch/results/T2 b/src/accelwattch/results/T2 similarity index 100% rename from src/gpuwattch/results/T2 rename to src/accelwattch/results/T2 diff --git a/src/gpuwattch/results/Xeon_core b/src/accelwattch/results/Xeon_core similarity index 100% rename from src/gpuwattch/results/Xeon_core rename to src/accelwattch/results/Xeon_core diff --git a/src/gpuwattch/results/Xeon_uncore b/src/accelwattch/results/Xeon_uncore similarity index 100% rename from src/gpuwattch/results/Xeon_uncore rename to src/accelwattch/results/Xeon_uncore diff --git a/src/gpuwattch/sharedcache.cc b/src/accelwattch/sharedcache.cc similarity index 100% rename from src/gpuwattch/sharedcache.cc rename to src/accelwattch/sharedcache.cc diff --git a/src/gpuwattch/sharedcache.h b/src/accelwattch/sharedcache.h similarity index 100% rename from src/gpuwattch/sharedcache.h rename to src/accelwattch/sharedcache.h diff --git a/src/gpuwattch/technology_xeon_core.cc b/src/accelwattch/technology_xeon_core.cc similarity index 100% rename from src/gpuwattch/technology_xeon_core.cc rename to src/accelwattch/technology_xeon_core.cc diff --git a/src/gpuwattch/version.h b/src/accelwattch/version.h similarity index 100% rename from src/gpuwattch/version.h rename to src/accelwattch/version.h diff --git a/src/gpuwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc similarity index 100% rename from src/gpuwattch/xmlParser.cc rename to src/accelwattch/xmlParser.cc diff --git a/src/gpuwattch/xmlParser.h b/src/accelwattch/xmlParser.h similarity index 100% rename from src/gpuwattch/xmlParser.h rename to src/accelwattch/xmlParser.h diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 71f0703ac..f9e5db314 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -588,65 +589,119 @@ void ptx_instruction::set_fp_or_int_archop() { oprnd_type = INT_OP; } } -void ptx_instruction::set_mul_div_or_other_archop() { - sp_op = OTHER_OP; - if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && - (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && - (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && - (m_opcode != CALL_OP)) { - if (get_type() == F32_TYPE || get_type() == F64_TYPE || - get_type() == FF64_TYPE) { - switch (get_opcode()) { - case MUL_OP: - case MAD_OP: - sp_op = FP_MUL_OP; - break; - case DIV_OP: - sp_op = FP_DIV_OP; - break; - case LG2_OP: - sp_op = FP_LG_OP; - break; - case RSQRT_OP: - case SQRT_OP: - sp_op = FP_SQRT_OP; - break; - case RCP_OP: - sp_op = FP_DIV_OP; - break; - case SIN_OP: - case COS_OP: - sp_op = FP_SIN_OP; - break; - case EX2_OP: - sp_op = FP_EXP_OP; - break; - default: - if ((op == ALU_OP) || (op == TENSOR_CORE_OP)) sp_op = FP__OP; - break; + +void ptx_instruction::set_mul_div_or_other_archop(){ + sp_op=OTHER_OP; + if((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && (m_opcode != CALL_OP)){ + if(get_type() == F64_TYPE || get_type() == FF64_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=DP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=DP_DIV_OP; + break; + case RCP_OP: + sp_op=DP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==DP_OP) || (op==ALU_OP)) + sp_op=DP___OP; + break; + } } - } else { - switch (get_opcode()) { - case MUL24_OP: - case MAD24_OP: - sp_op = INT_MUL24_OP; - break; - case MUL_OP: - case MAD_OP: - if (get_type() == U32_TYPE || get_type() == S32_TYPE || - get_type() == B32_TYPE) - sp_op = INT_MUL32_OP; - else - sp_op = INT_MUL_OP; - break; - case DIV_OP: - sp_op = INT_DIV_OP; - break; - default: - if ((op == ALU_OP)) sp_op = INT__OP; - break; + else if(get_type()==F16_TYPE || get_type()==F32_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=FP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=FP_DIV_OP; + break; + case RCP_OP: + sp_op=FP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + 
sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==SP_OP) || (op==ALU_OP)) + sp_op=FP__OP; + break; + } + }else { + switch(get_opcode()){ + case MUL24_OP: + case MAD24_OP: + sp_op=INT_MUL24_OP; + break; + case MUL_OP: + case MAD_OP: + case FMA_OP: + if(get_type()==U32_TYPE || get_type()==S32_TYPE || get_type()==B32_TYPE) + sp_op=INT_MUL32_OP; + else + sp_op=INT_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=INT_DIV_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==INTP_OP) || (op==ALU_OP)) + sp_op=INT__OP; + break; + } } - } } } @@ -880,6 +935,7 @@ void ptx_instruction::set_opcode_and_latency() { case MAD_OP: case MADC_OP: case MADP_OP: + case FMA_OP: // MAD latency switch (get_type()) { case F32_TYPE: @@ -903,7 +959,18 @@ void ptx_instruction::set_opcode_and_latency() { break; } break; + case MUL24_OP: //MUL24 is performed on mul32 units (with additional instructions for bitmasking) on devices with compute capability >1.x + latency = int_latency[2]+1; + initiation_interval = int_init[2]+1; + op = INTP_OP; + break; + case MAD24_OP: + latency = int_latency[3]+1; + initiation_interval = int_init[3]+1; + op = INTP_OP; + break; case DIV_OP: + case REM_OP: // Floating point only op = SFU_OP; switch (get_type()) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 0b990e83c..44afbe5aa 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Jimmy Kwa, George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
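
The rewritten set_mul_div_or_other_archop() above branches on the operand type first (F64/FF64, then F16/F32, then integer) and only afterwards maps the opcode onto a power-scaling bin, adding double-precision, tensor (MMA) and texture bins that the old code lacked. A minimal stand-alone sketch of that two-level dispatch follows; the Bin/Op/Ty enums and the classify() helper are simplified placeholders, not the simulator's real sp_op, opcode, or type identifiers.

#include <cstdio>

enum class Bin { DPMul, DPDiv, FPMul, FPDiv, IntMul32, IntDiv, Tensor, Other };
enum class Op  { Mul, Mad, Fma, Div, Rem, Mma, Other };
enum class Ty  { F64, F32, S32, Other };

// Type family is chosen first, then the opcode picks the scaling bin,
// mirroring the two-level dispatch of the rewritten function above.
static Bin classify(Ty t, Op o) {
  const bool dp = (t == Ty::F64);
  const bool fp = (t == Ty::F32);
  switch (o) {
    case Op::Mul: case Op::Mad: case Op::Fma:
      return dp ? Bin::DPMul : (fp ? Bin::FPMul : Bin::IntMul32);
    case Op::Div: case Op::Rem:
      return dp ? Bin::DPDiv : (fp ? Bin::FPDiv : Bin::IntDiv);
    case Op::Mma:
      return Bin::Tensor;  // tensor-core MMA gets its own bin
    default:
      return Bin::Other;
  }
}

int main() {
  // fma.f64 should land in the double-precision multiply bin.
  std::printf("%d\n", classify(Ty::F64, Op::Fma) == Bin::DPMul);
  return 0;
}
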
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "instructions.h" #include "half.h" #include "half.hpp" @@ -3977,7 +3979,7 @@ void mad_def(const ptx_instruction *pI, ptx_thread_info *thread, fesetround(FE_TOWARDZERO); break; default: - assert(0); + //assert(0); break; } d.f32 = a.f32 * b.f32 + c.f32; @@ -4323,11 +4325,8 @@ void mul_impl(const ptx_instruction *pI, ptx_thread_info *thread) { case S64_TYPE: t.s64 = a.s64 * b.s64; assert(!pI->is_wide()); - assert(!pI->is_hi()); - if (pI->is_lo()) - d.s64 = t.s64; - else - assert(0); + //assert(!pI->is_hi()); + d.s64 = t.s64; break; case U16_TYPE: t.u32 = ((unsigned)a.u16) * ((unsigned)b.u16); diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 675404597..7706f0b31 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -1,32 +1,34 @@ /* -Copyright (c) 2009-2011, Tor M. Aamodt -The University of British Columbia +Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +The University of British Columbia, Northwestern University All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, this -list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. -Neither the name of The University of British Columbia nor the names of its -contributors may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of The University of British Columbia, Northwestern + University nor the names of their contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. 
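
The instructions.cc hunk above also relaxes mul_impl() for s64 operands: the low 64 bits of the product are written back whether or not the .lo qualifier is present, instead of asserting. A tiny stand-alone illustration, not simulator code, of why the low half of a 64-by-64-bit multiply is simply the wrapping 64-bit product:

#include <cstdint>
#include <cstdio>

int main() {
  // mul.lo on 64-bit operands keeps the low 64 bits of the full 128-bit
  // product, which is exactly what wrapping 64-bit multiplication computes.
  uint64_t a = 0x123456789ABCDEF0ull;
  uint64_t b = 3;
  uint64_t lo = a * b;  // wraps modulo 2^64
  std::printf("low 64 bits: %llx\n", (unsigned long long)lo);
  return 0;
}
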
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + %option nounput %option noyywrap %option yylineno @@ -69,6 +71,7 @@ andn TC; yylval->int_value = ANDN_OP; return OPCODE; atom TC; yylval->int_value = ATOM_OP; return OPCODE; bar.warp TC; yylval->int_value = NOP_OP; return OPCODE; bar TC; yylval->int_value = BAR_OP; return OPCODE; +barrier TC; yylval->int_value = BAR_OP; return OPCODE; bfe TC; yylval->int_value = BFE_OP; return OPCODE; bfi TC; yylval->int_value = BFI_OP; return OPCODE; bfind TC; yylval->int_value = BFIND_OP; return OPCODE; @@ -167,14 +170,22 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; "CPTX_END" printf("ENDING CUSTOM PTX.\n"); BEGIN(IN_COMMENT); { -\.a\.sync TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; -\.b\.sync TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; -\.c\.sync TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; -\.d\.sync TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; -\.mma\.sync TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; +\.a\.sync\.aligned TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; +\.b\.sync\.aligned TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; +\.c\.sync\.aligned TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; +\.d\.sync\.aligned TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; +\.mma\.sync\.aligned TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; \.row TC; yylval->int_value = ROW; return LAYOUT; \.col TC; yylval->int_value = COL; return LAYOUT; +\.m16n16k16\.global TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.global TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.global TC; yylval->int_value = M8N32K16; return CONFIGURATION; + +\.m16n16k16\.shared TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.shared TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.shared TC; yylval->int_value = M8N32K16; return CONFIGURATION; + \.m16n16k16 TC; yylval->int_value = M16N16K16; return CONFIGURATION; \.m32n8k16 TC; yylval->int_value = M32N8K16; return CONFIGURATION; \.m8n32k16 TC; yylval->int_value = M8N32K16; return CONFIGURATION; @@ -476,4 +487,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s ) fflush(stdout); //exit(1); return 0; -} +} \ No newline at end of file diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index d3da4b541..2edc1ed56 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. 
Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -1384,6 +1385,8 @@ ptx_instruction::ptx_instruction( case CS_OPTION: case LU_OPTION: case CV_OPTION: + case WB_OPTION: + case WT_OPTION: m_cache_option = last_ptx_inst_option; break; case HALF_OPTION: diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index ca47c4684..545c45dfd 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -855,7 +856,7 @@ void dram_t::visualizer_print(gzFile visualizer_file) { void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, - unsigned &rd, unsigned &wr, + unsigned &rd, unsigned &wr, unsigned &wr_WB, unsigned &req) const { // Point power performance counters to low-level DRAM counters cmd = n_cmd; @@ -865,6 +866,7 @@ void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, pre = n_pre; rd = n_rd; wr = n_wr; + wr_WB = n_wr_WB; req = n_req; } diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 6c212e9be..88e46ed7b 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, -// George L. Yuan, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, +// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -135,7 +136,7 @@ class dram_t { // Power Model void set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, unsigned &rd, - unsigned &wr, unsigned &req) const; + unsigned &wr, unsigned &wr_WB, unsigned &req) const; const memory_config *m_config; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 7416246f0..a2aeec57f 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. 
Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -642,6 +643,7 @@ void cache_stats::clear() { /// for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { std::fill(m_stats[i].begin(), m_stats[i].end(), 0); + std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); std::fill(m_fail_stats[i].begin(), m_fail_stats[i].end(), 0); } m_cache_port_available_cycles = 0; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 67d084cbf..498dfebd0 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 56ede056c..e44551ee3 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, -// Ali Bakhoda, Andrew Turner, Ivan Sham -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -95,10 +96,11 @@ tr1_hash_map address_random_interleaving; #include "mem_latency_stat.h" + void power_config::reg_options(class OptionParser *opp) { - option_parser_register(opp, "-gpuwattch_xml_file", OPT_CSTR, - &g_power_config_name, "GPUWattch XML file", - "gpuwattch.xml"); + option_parser_register(opp, "-accelwattch_xml_file", OPT_CSTR, + &g_power_config_name, "AccelWattch XML file", + "accelwattch_sass_sim.xml"); option_parser_register(opp, "-power_simulation_enabled", OPT_BOOL, &g_power_simulation_enabled, @@ -108,6 +110,92 @@ void power_config::reg_options(class OptionParser *opp) { &g_power_per_cycle_dump, "Dump detailed power output each cycle", "0"); + + + + option_parser_register(opp, "-hw_perf_file_name", OPT_CSTR, + &g_hw_perf_file_name, "Hardware Performance Statistics file", + "hw_perf.csv"); + + option_parser_register(opp, "-hw_perf_bench_name", OPT_CSTR, + &g_hw_perf_bench_name, "Kernel Name in Hardware Performance Statistics file", + ""); + + option_parser_register(opp, "-power_simulation_mode", OPT_INT32, + &g_power_simulation_mode, + "Switch performance counter input for power simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)", "0"); + + option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, + &g_dvfs_enabled, + "Turn on DVFS for power model", "0"); + option_parser_register(opp, "-aggregate_power_stats", OPT_BOOL, + &g_aggregate_power_stats, + "Accumulate power across all kernels", "0"); + + //Accelwattch Hyrbid Configuration + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RH], + "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RM], + "Get L1 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WH], + "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WM], + "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RH], + "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RM], + "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WH], + "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WM], + "Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CC_ACC], + "Get Constant Cache Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_SHRD_ACC], + "Get Shared Memory Acesses for Accelwattch-Hybrid from 
Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_RD", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_RD], + "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_WR], + "Get DRAM Writes for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NOC], + "Get Interconnect Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_PIPE_DUTY], + "Get Pipeline Duty Cycle Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE], + "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CYCLES], + "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_VOLTAGE], + "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0"); + + // Output Data Formats option_parser_register( opp, "-power_trace_enabled", OPT_BOOL, &g_power_trace_enabled, @@ -835,7 +923,7 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) #ifdef GPGPUSIM_POWER_MODEL m_gpgpusim_wrapper = new gpgpu_sim_wrapper(config.g_power_simulation_enabled, - config.g_power_config_name); + config.g_power_config_name, config.g_power_simulation_mode, config.g_dvfs_enabled); #endif m_shader_stats = new shader_core_stats(m_shader_config); @@ -1010,6 +1098,14 @@ void gpgpu_sim::init() { partiton_reqs_in_parallel_util = 0; gpu_sim_cycle_parition_util = 0; +// McPAT initialization function. Called on first launch of GPU +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, + gpu_tot_sim_insn, gpu_sim_insn); + } +#endif + reinit_clock_domains(); gpgpu_ctx->func_sim->set_param_gpgpu_num_shaders(m_config.num_shader()); for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) @@ -1035,14 +1131,6 @@ void gpgpu_sim::init() { } if (g_network_mode) icnt_init(); - - // McPAT initialization function. 
Called on first launch of GPU -#ifdef GPGPUSIM_POWER_MODEL - if (m_config.g_power_simulation_enabled) { - init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, - gpu_tot_sim_insn, gpu_sim_insn); - } -#endif } void gpgpu_sim::update_stats() { @@ -1067,6 +1155,11 @@ void gpgpu_sim::update_stats() { gpu_occupancy = occupancy_stats(); } +PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() +{ + return m_gpgpusim_wrapper->get_scaling_coeffs(); +} + void gpgpu_sim::print_stats() { gpgpu_ctx->stats->ptx_file_line_stats_write_file(); gpu_print_stat(); @@ -1146,6 +1239,18 @@ std::string gpgpu_sim::executed_kernel_info_string() { return statout.str(); } + +std::string gpgpu_sim::executed_kernel_name() { + std::stringstream statout; + if( m_executed_kernel_names.size() == 1) + statout << m_executed_kernel_names[0]; + else{ + for (unsigned int k = 0; k < m_executed_kernel_names.size(); k++) { + statout << m_executed_kernel_names[k] << " "; + } + } + return statout.str(); +} void gpgpu_sim::set_cache_config(std::string kernel_name, FuncCache cacheConfig) { m_special_cache_config[kernel_name] = cacheConfig; @@ -1326,10 +1431,20 @@ void gpgpu_sim::gpu_print_stat() { m_shader_stats->print(stdout); #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode > 0){ + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); + calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_power_simulation_mode, m_config.g_dvfs_enabled, + m_config.g_hw_perf_file_name, m_config.g_hw_perf_bench_name, executed_kernel_name(), m_config.accelwattch_hybrid_configuration, m_config.g_aggregate_power_stats); + } m_gpgpusim_wrapper->print_power_kernel_stats( gpu_sim_cycle, gpu_tot_sim_cycle, gpu_tot_sim_insn + gpu_sim_insn, kernel_info_str, true); - mcpat_reset_perf_count(m_gpgpusim_wrapper); + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); } #endif @@ -1796,6 +1911,7 @@ void gpgpu_sim::cycle() { m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); } } @@ -1839,7 +1955,7 @@ void gpgpu_sim::cycle() { m_cluster[i]->core_cycle(); *active_sms += m_cluster[i]->get_n_active_sms(); } - // Update core icnt/cache stats for GPUWattch + // Update core icnt/cache stats for AccelWattch m_cluster[i]->get_icnt_stats( m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); @@ -1869,10 +1985,12 @@ void gpgpu_sim::cycle() { // McPAT main cycle (interface with McPAT) #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode == 0){ mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, - gpu_sim_insn); + gpu_sim_insn, m_config.g_dvfs_enabled); + } } #endif diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 2e6820d82..68b3dfa10 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. 
Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
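
The gpu-sim.cc changes above introduce -power_simulation_mode (0 = performance-simulator counters, 1 = hardware counters read from the -hw_perf_file_name CSV, 2 = a per-counter hybrid selected by the -accelwattch_hybrid_perfsim_* flags) together with -dvfs_enabled, and they gate the per-sample mcpat_cycle() path on mode 0 while modes 1 and 2 go through calculate_hw_mcpat() at kernel-stat printout. A minimal sketch of that dispatch; the function names below are hypothetical stand-ins, not the simulator's entry points.

#include <cstdio>

enum PowerSimMode { PERF_SIM = 0, HW_COUNTERS = 1, HYBRID = 2 };

// Hypothetical stand-ins for the real mcpat_cycle()/calculate_hw_mcpat() calls.
static void sample_from_perf_model() { std::puts("per-window McPAT sample"); }
static void replay_from_hw_csv(bool hybrid) {
  std::puts(hybrid ? "hybrid HW/Sim replay" : "pure HW-counter replay");
}

static void power_sample(PowerSimMode mode, unsigned long long cycle,
                         unsigned stat_sample_freq, bool kernel_done) {
  if (mode == PERF_SIM) {
    if (cycle % stat_sample_freq == 0) sample_from_perf_model();
  } else if (kernel_done) {
    // HW and hybrid modes are evaluated once per kernel, not per sample window.
    replay_from_hw_csv(mode == HYBRID);
  }
}

int main() {
  power_sample(PERF_SIM, 5000, 500, false);  // samples: 5000 % 500 == 0
  power_sample(HYBRID, 5000, 500, true);     // replays the kernel from the HW CSV
  return 0;
}
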
+ #ifndef GPU_SIM_H #define GPU_SIM_H @@ -68,6 +70,29 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; +enum hw_perf_t { + HW_BENCH_NAME=0, + HW_KERNEL_NAME, + HW_L1_RH, + HW_L1_RM, + HW_L1_WH, + HW_L1_WM, + HW_CC_ACC, + HW_SHRD_ACC, + HW_DRAM_RD, + HW_DRAM_WR, + HW_L2_RH, + HW_L2_RM, + HW_L2_WH, + HW_L2_WM, + HW_NOC, + HW_PIPE_DUTY, + HW_NUM_SM_IDLE, + HW_CYCLES, + HW_VOLTAGE, + HW_TOTAL_STATS +}; + struct power_config { power_config() { m_valid = true; } void init() { @@ -82,7 +107,8 @@ struct power_config { s++; } char buf1[1024]; - snprintf(buf1, 1024, "gpgpusim_power_report__%s.log", date); + //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; snprintf(buf2, 1024, "gpgpusim_power_trace_report__%s.log.gz", date); @@ -94,6 +120,9 @@ struct power_config { snprintf(buf4, 1024, "gpgpusim_steady_state_tracking_report__%s.log.gz", date); g_steady_state_tracking_filename = strdup(buf4); + // for(int i =0; i< hw_perf_t::HW_TOTAL_STATS; i++){ + // accelwattch_hybrid_configuration[i] = 0; + // } if (g_steady_power_levels_enabled) { sscanf(gpu_steady_state_definition, "%lf:%lf", @@ -125,6 +154,14 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; + + char *g_hw_perf_file_name; + char *g_hw_perf_bench_name; + int g_power_simulation_mode; + bool g_dvfs_enabled; + bool g_aggregate_power_stats; + bool accelwattch_hybrid_configuration[hw_perf_t::HW_TOTAL_STATS]; + // Nonlinear power model bool g_use_nonlinear_model; char *gpu_nonlinear_model_config; @@ -357,7 +394,7 @@ class gpgpu_sim_config : public power_config, m_valid = true; } - + unsigned get_core_freq() const { return core_freq; } unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } @@ -527,6 +564,7 @@ class gpgpu_sim : public gpgpu_t { bool kernel_more_cta_left(kernel_info_t *kernel) const; bool hit_max_cta_count() const; kernel_info_t *select_kernel(); + PowerscalingCoefficients *get_scaling_coeffs(); void decrement_kernel_latency(); const gpgpu_sim_config &get_config() const { return m_config; } @@ -634,6 +672,7 @@ class gpgpu_sim : public gpgpu_t { std::string executed_kernel_info_string(); //< format the kernel information // into a string for stat printout + std::string executed_kernel_name(); void clear_executed_kernel_info(); //< clear the kernel information after // stat printout virtual void createSIMTCluster() = 0; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index f1c761fe5..511c15efa 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. 
-// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -388,9 +389,9 @@ void memory_partition_unit::set_done(mem_fetch *mf) { void memory_partition_unit::set_dram_power_stats( unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, - unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_req) const { + unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const { m_dram->set_dram_power_stats(n_cmd, n_activity, n_nop, n_act, n_pre, n_rd, - n_wr, n_req); + n_wr, n_wr_WB, n_req); } void memory_partition_unit::print(FILE *fp) const { @@ -664,6 +665,7 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned pre = 0; unsigned rd = 0; unsigned wr = 0; + unsigned wr_WB = 0; unsigned req = 0; unsigned tot_cmd = 0; unsigned tot_nop = 0; @@ -675,13 +677,13 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { m_memory_partition_unit[i]->set_dram_power_stats(cmd, activity, nop, act, - pre, rd, wr, req); + pre, rd, wr, wr_WB, req); tot_cmd += cmd; tot_nop += nop; tot_act += act; tot_pre += pre; tot_rd += rd; - tot_wr += wr; + tot_wr += wr + wr_WB; tot_req += req; } fprintf(fout, "gpgpu_n_dram_reads = %d\n", tot_rd); diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index beed76562..902a4b7c0 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -95,7 +96,7 @@ class memory_partition_unit { // Power model void set_dram_power_stats(unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, unsigned &n_pre, - unsigned &n_rd, unsigned &n_wr, + unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const; int global_sub_partition_id_to_local_id(int global_sub_partition_id) const; diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index c637d846f..63b985260 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,8 +27,10 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
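
The dram.cc and l2cache.cc hunks above thread a separate write-back counter (wr_WB) through set_dram_power_stats() and fold it into the total write count when printing (tot_wr += wr + wr_WB), so write-back traffic reaches the power model instead of being dropped. A minimal sketch of that aggregation with invented counter values:

#include <cstdio>

struct DramPowerCounters {
  unsigned rd;     // demand reads
  unsigned wr;     // demand writes
  unsigned wr_WB;  // write-back writes, now tracked separately
};

int main() {
  // Two memory partitions with made-up counts.
  const DramPowerCounters part[2] = {{10, 4, 2}, {7, 3, 1}};
  unsigned tot_rd = 0, tot_wr = 0;
  for (const auto &p : part) {
    tot_rd += p.rd;
    tot_wr += p.wr + p.wr_WB;  // write-backs count toward total DRAM writes
  }
  std::printf("total reads = %u, total writes = %u\n", tot_rd, tot_wr);
  return 0;
}
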
+ #include "power_interface.h" + void init_mcpat(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, unsigned stat_sample_freq, unsigned tot_inst, unsigned inst) { @@ -38,7 +41,11 @@ void init_mcpat(const gpgpu_sim_config &config, config.g_power_simulation_enabled, config.g_power_trace_enabled, config.g_steady_power_levels_enabled, config.g_power_per_cycle_dump, config.gpu_steady_power_deviation, config.gpu_steady_min_period, - config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq); + config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, + config.g_power_simulation_mode, + config.g_dvfs_enabled, + config.get_core_freq()/1000000, + config.num_shader()); } void mcpat_cycle(const gpgpu_sim_config &config, @@ -46,7 +53,7 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst) { + unsigned inst, bool dvfs_enabled) { static bool mcpat_init = true; if (mcpat_init) { // If first cycle, don't have any power numbers yet @@ -55,41 +62,45 @@ void mcpat_cycle(const gpgpu_sim_config &config, } if ((tot_cycle + cycle) % stat_sample_freq == 0) { + if(dvfs_enabled){ + wrapper->set_model_voltage(1); //performance model needs to support this. + } + wrapper->set_inst_power( shdr_config->gpgpu_clock_gated_lanes, stat_sample_freq, - stat_sample_freq, power_stats->get_total_inst(), - power_stats->get_total_int_inst(), power_stats->get_total_fp_inst(), - power_stats->get_l1d_read_accesses(), - power_stats->get_l1d_write_accesses(), - power_stats->get_committed_inst()); + stat_sample_freq, power_stats->get_total_inst(0), + power_stats->get_total_int_inst(0), power_stats->get_total_fp_inst(0), + power_stats->get_l1d_read_accesses(0), + power_stats->get_l1d_write_accesses(0), + power_stats->get_committed_inst(0)); // Single RF for both int and fp ops - wrapper->set_regfile_power(power_stats->get_regfile_reads(), - power_stats->get_regfile_writes(), - power_stats->get_non_regfile_operands()); + wrapper->set_regfile_power(power_stats->get_regfile_reads(0), + power_stats->get_regfile_writes(0), + power_stats->get_non_regfile_operands(0)); // Instruction cache stats - wrapper->set_icache_power(power_stats->get_inst_c_hits(), - power_stats->get_inst_c_misses()); + wrapper->set_icache_power(power_stats->get_inst_c_hits(0), + power_stats->get_inst_c_misses(0)); // Constant Cache, shared memory, texture cache - wrapper->set_ccache_power(power_stats->get_constant_c_hits(), - power_stats->get_constant_c_misses()); + wrapper->set_ccache_power(power_stats->get_const_accessess(0), 0); //assuming all HITS in constant cache for now wrapper->set_tcache_power(power_stats->get_texture_c_hits(), power_stats->get_texture_c_misses()); - wrapper->set_shrd_mem_power(power_stats->get_shmem_read_access()); + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(0)); wrapper->set_l1cache_power( - power_stats->get_l1d_read_hits(), power_stats->get_l1d_read_misses(), - power_stats->get_l1d_write_hits(), power_stats->get_l1d_write_misses()); + power_stats->get_l1d_read_hits(0), power_stats->get_l1d_read_misses(0), + power_stats->get_l1d_write_hits(0), power_stats->get_l1d_write_misses(0)); wrapper->set_l2cache_power( - power_stats->get_l2_read_hits(), power_stats->get_l2_read_misses(), - power_stats->get_l2_write_hits(), power_stats->get_l2_write_misses()); + power_stats->get_l2_read_hits(0), power_stats->get_l2_read_misses(0), + 
power_stats->get_l2_write_hits(0), power_stats->get_l2_write_misses(0)); float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; float num_cores = shdr_config->num_shader(); float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); wrapper->set_idle_core_power(num_idle_core); // pipeline power - pipeline_duty_cycle *= percent_active_sms; @@ -101,38 +112,64 @@ void mcpat_cycle(const gpgpu_sim_config &config, wrapper->set_duty_cycle_power(pipeline_duty_cycle); // Memory Controller - wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(), - power_stats->get_dram_wr(), - power_stats->get_dram_pre()); + wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(0), + power_stats->get_dram_wr(0), + power_stats->get_dram_pre(0)); // Execution pipeline accesses // FPU (SP) accesses, Integer ALU (not present in Tesla), Sfu accesses - wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(), - power_stats->get_ialu_accessess(), - power_stats->get_tot_sfu_accessess()); + + wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), + power_stats->get_intmul24_accessess(0), + power_stats->get_intmul32_accessess(0), + power_stats->get_intmul_accessess(0), + power_stats->get_intdiv_accessess(0)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), + power_stats->get_dpmul_accessess(0), + power_stats->get_dpdiv_accessess(0)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), + power_stats->get_fpmul_accessess(0), + power_stats->get_fpdiv_accessess(0)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(0), + power_stats->get_log_accessess(0), + power_stats->get_sin_accessess(0), + power_stats->get_exp_accessess(0)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(0)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(0)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(0), + power_stats->get_ialu_accessess(0), + power_stats->get_tot_sfu_accessess(0)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(0)); // Average active lanes for sp and sfu pipelines float avg_sp_active_lanes = (power_stats->get_sp_active_lanes()) / stat_sample_freq; float avg_sfu_active_lanes = (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; assert(avg_sp_active_lanes <= 32); assert(avg_sfu_active_lanes <= 32); - wrapper->set_active_lanes_power( - (power_stats->get_sp_active_lanes()) / stat_sample_freq, - (power_stats->get_sfu_active_lanes()) / stat_sample_freq); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); double n_icnt_simt_to_mem = (double) - power_stats->get_icnt_simt_to_mem(); // # flits from SIMT clusters + power_stats->get_icnt_simt_to_mem(0); // # flits from SIMT clusters // to memory partitions double n_icnt_mem_to_simt = (double) - power_stats->get_icnt_mem_to_simt(); // # flits from memory + power_stats->get_icnt_mem_to_simt(0); // # flits from memory // partitions to SIMT clusters - wrapper->set_NoC_power( - n_icnt_mem_to_simt, - n_icnt_simt_to_mem); // Number of flits traversing the interconnect + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect wrapper->compute(); @@ -152,3 +189,336 @@ void mcpat_cycle(const gpgpu_sim_config &config, void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper) { wrapper->reset_counters(); } + +bool 
parse_hw_file(char* hwpowerfile, bool find_target_kernel, std::vector<std::string> &hw_data, char* benchname, std::string executed_kernelname){
+  fstream hw_file;
+  hw_file.open(hwpowerfile, ios::in);
+  string line, word, temp;
+  while(!hw_file.eof()){
+    hw_data.clear();
+    getline(hw_file, line);
+    stringstream s(line);
+    while (getline(s,word,',')){
+      hw_data.push_back(word);
+    }
+    if(hw_data[HW_BENCH_NAME] == std::string(benchname)){
+      if(find_target_kernel){
+        if(hw_data[HW_KERNEL_NAME] == ""){
+          hw_file.close();
+          return true;
+        }
+        else{
+          if(hw_data[HW_KERNEL_NAME] == executed_kernelname){
+            hw_file.close();
+            return true;
+          }
+        }
+      }
+      else{
+        hw_file.close();
+        return true;
+      }
+    }
+  }
+  hw_file.close();
+  return false;
+}
+
+
+void calculate_hw_mcpat(const gpgpu_sim_config &config,
+                 const shader_core_config *shdr_config,
+                 class gpgpu_sim_wrapper *wrapper,
+                 class power_stat_t *power_stats, unsigned stat_sample_freq,
+                 unsigned tot_cycle, unsigned cycle, unsigned tot_inst,
+                 unsigned inst, int power_simulation_mode, bool dvfs_enabled, char* hwpowerfile,
+                 char* benchname, std::string executed_kernelname,
+                 const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats){
+
+  /* Reading HW data from CSV file */
+
+  std::vector<std::string> hw_data;
+  bool kernel_found = false;
+  kernel_found = parse_hw_file(hwpowerfile, true, hw_data, benchname, executed_kernelname); //Searching for matching executed_kernelname.
+  if(!kernel_found)
+    kernel_found = parse_hw_file(hwpowerfile, false, hw_data, benchname, executed_kernelname); //Searching for any kernel with same benchname.
+  assert("Could not find perf stats for the target benchmark in hwpowerfile.\n" && (kernel_found));
+  unsigned perf_cycles = static_cast<unsigned>(std::stod(hw_data[HW_CYCLES]) + 0.5);
+  if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CYCLES]))
+    perf_cycles = cycle;
+  wrapper->init_mcpat_hw_mode(perf_cycles); //total PERF MODEL cycles for current kernel
+
+  if(dvfs_enabled){
+    if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_VOLTAGE]))
+      wrapper->set_model_voltage(1); //performance model needs to support this
+    else
+      wrapper->set_model_voltage(std::stod(hw_data[HW_VOLTAGE])); //performance model needs to support this
+  }
+
+  double l1_read_hits = std::stod(hw_data[HW_L1_RH]);
+  double l1_read_misses = std::stod(hw_data[HW_L1_RM]);
+  double l1_write_hits = std::stod(hw_data[HW_L1_WH]);
+  double l1_write_misses = std::stod(hw_data[HW_L1_WM]);
+
+  if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RH]))
+    l1_read_hits = power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel;
+  if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RM]))
+    l1_read_misses = power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel;
+  if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WH]))
+    l1_write_hits = power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel;
+  if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WM]))
+    l1_write_misses = power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel;
+
+  if(aggregate_power_stats){
+    power_stats->tot_inst_execution += power_stats->get_total_inst(1);
+    power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1);
+    power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1);
+    power_stats->commited_inst_execution += power_stats->get_committed_inst(1);
+    wrapper->set_inst_power(
+        shdr_config->gpgpu_clock_gated_lanes,
cycle, //TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->tot_inst_execution, + power_stats->tot_int_inst_execution, power_stats->tot_fp_inst_execution, + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->commited_inst_execution); + } + else{ + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->get_total_inst(1), + power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->get_committed_inst(1)); + } + + // Single RF for both int and fp ops -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register files + wrapper->set_regfile_power(power_stats->get_regfile_reads(1), + power_stats->get_regfile_writes(1), + power_stats->get_non_regfile_operands(1)); + + // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for instruction cache + wrapper->set_icache_power(power_stats->get_inst_c_hits(1) - power_stats->l1i_hits_kernel, + power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + + // Constant Cache, shared memory, texture cache + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CC_ACC])) + wrapper->set_ccache_power(power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, 0); //assuming all HITS in constant cache for now + else + wrapper->set_ccache_power(std::stod(hw_data[HW_CC_ACC]), 0); //assuming all HITS in constant cache for now + + + // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), + // power_stats->get_texture_c_misses()); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_SHRD_ACC])) + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - power_stats->shared_accesses_kernel); + else + wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); + + wrapper->set_l1cache_power( l1_read_hits, l1_read_misses, l1_write_hits, l1_write_misses); + + double l2_read_hits = std::stod(hw_data[HW_L2_RH]); + double l2_read_misses = std::stod(hw_data[HW_L2_RM]); + double l2_write_hits = std::stod(hw_data[HW_L2_WH]); + double l2_write_misses = std::stod(hw_data[HW_L2_WM]); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RH])) + l2_read_hits = power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RM])) + l2_read_misses = power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WH])) + l2_write_hits = power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WM])) + l2_write_misses = power_stats->get_l2_write_misses(1) - power_stats->l2w_misses_kernel; + + wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, l2_write_misses); + + float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; + float num_cores = shdr_config->num_shader(); + float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) + wrapper->set_idle_core_power(num_idle_core); + else + wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); + 
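Every guarded statistic above follows the same hybrid-mode pattern: when power_simulation_mode == 2 and the matching accelwattch_hybrid_configuration entry is set, the value comes from the performance model as a per-kernel delta (cumulative counter minus the snapshot taken at the previous kernel boundary); otherwise the value from the hardware CSV row is used. A compact sketch of that selection, with hypothetical parameter names:

// Sketch of the per-statistic selection repeated throughout calculate_hw_mcpat.
// Parameter names are illustrative; only the selection logic is taken from the
// code above.
double select_stat(int power_simulation_mode, const bool *hybrid_cfg, int stat_id,
                   double sim_cumulative, double sim_at_last_kernel,
                   double hw_csv_value) {
  bool use_perf_model = (power_simulation_mode == 2) && hybrid_cfg[stat_id];
  // Hybrid mode: per-kernel delta of the simulator counter; otherwise the HW value.
  return use_perf_model ? (sim_cumulative - sim_at_last_kernel) : hw_csv_value;
}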
+ float pipeline_duty_cycle = + ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < + 0.8) + ? ((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) + : 0.8; + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) + wrapper->set_duty_cycle_power(pipeline_duty_cycle); + else + wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + + // Memory Controller + + double dram_reads = std::stod(hw_data[HW_DRAM_RD]); + double dram_writes = std::stod(hw_data[HW_DRAM_WR]); + double dram_pre = 0; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_WR])) + dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; + + + wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); + + if(aggregate_power_stats){ + power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); + power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); + power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); + power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); + power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); + power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); + power_stats->dpmul_acc_execution += power_stats->get_dpmul_accessess(1); + power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); + power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); + power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); + power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); + power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); + power_stats->log_acc_execution += power_stats->get_log_accessess(1); + power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); + power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); + power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); + power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); + power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); + power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); + power_stats->tot_threads_acc_execution += power_stats->get_tot_threads_kernel(1); + power_stats->tot_warps_acc_execution += power_stats->get_tot_warps_kernel(1); + + power_stats->sp_active_lanes_execution += (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + power_stats->sfu_active_lanes_execution += (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + + wrapper->set_int_accesses(power_stats->ialu_acc_execution, + power_stats->imul24_acc_execution, + power_stats->imul32_acc_execution, + power_stats->imul_acc_execution, + power_stats->idiv_acc_execution); + + wrapper->set_dp_accesses(power_stats->dp_acc_execution, + power_stats->dpmul_acc_execution, + power_stats->dpdiv_acc_execution); + + wrapper->set_fp_accesses(power_stats->fp_acc_execution, + power_stats->fpmul_acc_execution, + power_stats->fpdiv_acc_execution); + + wrapper->set_trans_accesses(power_stats->sqrt_acc_execution, + 
power_stats->log_acc_execution, + power_stats->sin_acc_execution, + power_stats->exp_acc_execution); + + wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); + + wrapper->set_tex_accesses(power_stats->tex_acc_execution); + + wrapper->set_exec_unit_power(power_stats->ialu_acc_execution, + power_stats->tot_fpu_acc_execution, + power_stats->tot_sfu_acc_execution); + + wrapper->set_avg_active_threads((double)((double)power_stats->tot_threads_acc_execution / (double)power_stats->tot_warps_acc_execution)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + else{ + wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), + power_stats->get_intmul24_accessess(1), + power_stats->get_intmul32_accessess(1), + power_stats->get_intmul_accessess(1), + power_stats->get_intdiv_accessess(1)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), + power_stats->get_dpmul_accessess(1), + power_stats->get_dpdiv_accessess(1)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), + power_stats->get_fpmul_accessess(1), + power_stats->get_fpdiv_accessess(1)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(1), + power_stats->get_log_accessess(1), + power_stats->get_sin_accessess(1), + power_stats->get_exp_accessess(1)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), + power_stats->get_ialu_accessess(1), + power_stats->get_tot_sfu_accessess(1)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->get_sp_active_lanes()) / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + + + double n_icnt_simt_to_mem = + (double) + (power_stats->get_icnt_simt_to_mem(1) - power_stats->noc_tr_kernel); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = + (double) + (power_stats->get_icnt_mem_to_simt(1)- power_stats->noc_rc_kernel); // # flits from memory + // partitions to SIMT clusters + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NOC])) + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect from Accel-Sim + else + wrapper->set_NoC_power(std::stod(hw_data[HW_NOC])); // Number of flits traversing the interconnect from HW + + wrapper->compute(); + + wrapper->update_components_power(); + + wrapper->power_metrics_calculations(); + + wrapper->dump(); + 
power_stats->l1r_hits_kernel = power_stats->get_l1d_read_hits(1);
+  power_stats->l1r_misses_kernel = power_stats->get_l1d_read_misses(1);
+  power_stats->l1w_hits_kernel = power_stats->get_l1d_write_hits(1);
+  power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1);
+  power_stats->shared_accesses_kernel = power_stats->get_shmem_access(1);
+  power_stats->cc_accesses_kernel = power_stats->get_const_accessess(1);
+  power_stats->dram_rd_kernel = power_stats->get_dram_rd(1);
+  power_stats->dram_wr_kernel = power_stats->get_dram_wr(1);
+  power_stats->dram_pre_kernel = power_stats->get_dram_pre(1);
+  power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1);
+  power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1);
+  power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1);
+  power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1);
+  power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1);
+  power_stats->l2w_misses_kernel = power_stats->get_l2_write_misses(1);
+  power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1);
+  power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1);
+
+
+  power_stats->clear();
+}
\ No newline at end of file
diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h
index 2bfd4d504..1a488948c 100644
--- a/src/gpgpu-sim/power_interface.h
+++ b/src/gpgpu-sim/power_interface.h
@@ -1,18 +1,19 @@
-// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington
-// The University of British Columbia
+// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas
+// The University of British Columbia, Northwestern University
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
-// Redistributions of source code must retain the above copyright notice, this
-// list of conditions and the following disclaimer.
-// Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution. Neither the name of
-// The University of British Columbia nor the names of its contributors may be
-// used to endorse or promote products derived from this software without
-// specific prior written permission.
+// 1. Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer;
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution;
+// 3. Neither the names of The University of British Columbia, Northwestern
+// University nor the names of their contributors may be used to
+// endorse or promote products derived from this software without specific
+// prior written permission.
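The *_kernel assignments above, at the end of calculate_hw_mcpat, snapshot the cumulative counters at each kernel boundary so that the next invocation can form per-kernel deltas; power_stats->clear() then resets the sampled statistics. A small stand-in sketch of that bookkeeping:

// Stand-in illustration of the kernel-boundary bookkeeping: the cumulative
// counter is snapshotted at the end of each kernel, and the next kernel's
// contribution is the difference from that snapshot.
struct per_kernel_counter {
  unsigned long long at_last_kernel = 0;  // e.g. l2r_hits_kernel

  unsigned long long kernel_delta(unsigned long long cumulative) const {
    return cumulative - at_last_kernel;   // e.g. get_l2_read_hits(1) - l2r_hits_kernel
  }
  void snapshot(unsigned long long cumulative) { at_last_kernel = cumulative; }
};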
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -43,7 +44,19 @@ void mcpat_cycle(const gpgpu_sim_config &config,
                  class gpgpu_sim_wrapper *wrapper,
                  class power_stat_t *power_stats, unsigned stat_sample_freq,
                  unsigned tot_cycle, unsigned cycle, unsigned tot_inst,
-                 unsigned inst);
+                 unsigned inst, bool dvfs_enabled);
+
+void calculate_hw_mcpat(const gpgpu_sim_config &config,
+                 const shader_core_config *shdr_config,
+                 class gpgpu_sim_wrapper *wrapper,
+                 class power_stat_t *power_stats, unsigned stat_sample_freq,
+                 unsigned tot_cycle, unsigned cycle, unsigned tot_inst,
+                 unsigned inst, int power_simulation_mode, bool dvfs_enabled,
+                 char* hwpowerfile, char* benchname, std::string executed_kernelname,
+                 const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats);
+
+bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, std::vector<std::string> &hw_data, char* benchname, std::string executed_kernelname);
+
 void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper);

 #endif /* POWER_INTERFACE_H_ */
diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc
index 7b60ddf84..fd7a77560 100644
--- a/src/gpgpu-sim/power_stat.cc
+++ b/src/gpgpu-sim/power_stat.cc
@@ -1,18 +1,19 @@
-// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington
-// The University of British Columbia
+// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas
+// The University of British Columbia, Northwestern University
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
-// Redistributions of source code must retain the above copyright notice, this
-// list of conditions and the following disclaimer.
-// Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution. Neither the name of
-// The University of British Columbia nor the names of its contributors may be
-// used to endorse or promote products derived from this software without
-// specific prior written permission.
+// 1. Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer;
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution;
+// 3. Neither the names of The University of British Columbia, Northwestern
+// University nor the names of their contributors may be used to
+// endorse or promote products derived from this software without specific
+// prior written permission.
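As a usage note for the declarations above, the lookup is a two-step fallback: first an exact match on the executed kernel name, then any row recorded for the same benchmark, which is the order calculate_hw_mcpat uses. A minimal sketch, where the file, benchmark and kernel strings are placeholders:

#include <string>
#include <vector>

// Declaration as in power_interface.h above.
bool parse_hw_file(char* hwpowerfile, bool find_target_kernel,
                   std::vector<std::string> &hw_data, char* benchname,
                   std::string executed_kernelname);

bool lookup_hw_row_example() {
  char hwpowerfile[] = "hw_perf.csv";             // placeholder path
  char benchname[] = "backprop";                  // placeholder benchmark name
  std::string kernel = "bpnn_layerforward_CUDA";  // placeholder kernel name
  std::vector<std::string> hw_data;

  // First look for the row whose kernel name matches the executed kernel...
  bool found = parse_hw_file(hwpowerfile, true, hw_data, benchname, kernel);
  // ...then fall back to any row recorded for the same benchmark.
  if (!found)
    found = parse_hw_file(hwpowerfile, false, hw_data, benchname, kernel);
  return found;  // on success, hw_data holds one CSV row indexed by the HW_* enums
}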
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,10 +55,64 @@ power_mem_stat_t::power_mem_stat_t(const memory_config *mem_config, init(); } +void power_stat_t::clear(){ + for(unsigned i=0; i< NUM_STAT_IDX; ++i){ + pwr_mem_stat->core_cache_stats[i].clear(); + pwr_mem_stat->l2_cache_stats[i].clear(); + for(unsigned j=0; jnum_shader(); ++j){ + pwr_core_stat->m_pipeline_duty_cycle[i][j]=0; + pwr_core_stat->m_num_decoded_insn[i][j]=0; + pwr_core_stat->m_num_FPdecoded_insn[i][j]=0; + pwr_core_stat->m_num_INTdecoded_insn[i][j]=0; + pwr_core_stat->m_num_storequeued_insn[i][j]=0; + pwr_core_stat->m_num_loadqueued_insn[i][j]=0; + pwr_core_stat->m_num_tex_inst[i][j]=0; + pwr_core_stat->m_num_ialu_acesses[i][j]=0; + pwr_core_stat->m_num_fp_acesses[i][j]=0; + pwr_core_stat->m_num_imul_acesses[i][j]=0; + pwr_core_stat->m_num_imul24_acesses[i][j]=0; + pwr_core_stat->m_num_imul32_acesses[i][j]=0; + pwr_core_stat->m_num_fpmul_acesses[i][j]=0; + pwr_core_stat->m_num_idiv_acesses[i][j]=0; + pwr_core_stat->m_num_fpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_dp_acesses[i][j]=0; + pwr_core_stat->m_num_dpmul_acesses[i][j]=0; + pwr_core_stat->m_num_dpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_tensor_core_acesses[i][j]=0; + pwr_core_stat->m_num_const_acesses[i][j]=0; + pwr_core_stat->m_num_tex_acesses[i][j]=0; + pwr_core_stat->m_num_sp_acesses[i][j]=0; + pwr_core_stat->m_num_sfu_acesses[i][j]=0; + pwr_core_stat->m_num_sqrt_acesses[i][j]=0; + pwr_core_stat->m_num_log_acesses[i][j]=0; + pwr_core_stat->m_num_sin_acesses[i][j]=0; + pwr_core_stat->m_num_exp_acesses[i][j]=0; + pwr_core_stat->m_num_mem_acesses[i][j]=0; + pwr_core_stat->m_num_sp_committed[i][j]=0; + pwr_core_stat->m_num_sfu_committed[i][j]=0; + pwr_core_stat->m_num_mem_committed[i][j]=0; + pwr_core_stat->m_read_regfile_acesses[i][j]=0; + pwr_core_stat->m_write_regfile_acesses[i][j]=0; + pwr_core_stat->m_non_rf_operands[i][j]=0; + pwr_core_stat->m_active_sp_lanes[i][j]=0; + pwr_core_stat->m_active_sfu_lanes[i][j]=0; + pwr_core_stat->m_active_exu_threads[i][j]=0; + pwr_core_stat->m_active_exu_warps[i][j]=0; + } + for (unsigned j = 0; j < m_mem_config->m_n_mem; ++j) { + pwr_mem_stat->n_rd[i][j]=0; + pwr_mem_stat->n_wr[i][j]=0; + pwr_mem_stat->n_pre[i][j]=0; + } + } +} + + + void power_mem_stat_t::init() { - shmem_read_access[CURRENT_STAT_IDX] = + shmem_access[CURRENT_STAT_IDX] = m_core_stats->gpgpu_n_shmem_bank_access; // Shared memory access - shmem_read_access[PREV_STAT_IDX] = + shmem_access[PREV_STAT_IDX] = (unsigned *)calloc(m_core_config->num_shader(), sizeof(unsigned)); for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { @@ -71,6 +126,7 @@ void power_mem_stat_t::init() { n_pre[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_rd[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_wr[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); + n_wr_WB[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_req[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); // Interconnect stats @@ -86,8 +142,8 @@ void power_mem_stat_t::save_stats() { l2_cache_stats[PREV_STAT_IDX] = l2_cache_stats[CURRENT_STAT_IDX]; for (unsigned i = 0; i < m_core_config->num_shader(); ++i) { - shmem_read_access[PREV_STAT_IDX][i] = - shmem_read_access[CURRENT_STAT_IDX][i]; // Shared memory access + shmem_access[PREV_STAT_IDX][i] = + shmem_access[CURRENT_STAT_IDX][i]; // Shared memory access } for (unsigned i = 0; i < 
m_config->m_n_mem; ++i) { @@ -98,6 +154,7 @@ void power_mem_stat_t::save_stats() { n_pre[PREV_STAT_IDX][i] = n_pre[CURRENT_STAT_IDX][i]; n_rd[PREV_STAT_IDX][i] = n_rd[CURRENT_STAT_IDX][i]; n_wr[PREV_STAT_IDX][i] = n_wr[CURRENT_STAT_IDX][i]; + n_wr_WB[PREV_STAT_IDX][i] = n_wr_WB[CURRENT_STAT_IDX][i]; n_req[PREV_STAT_IDX][i] = n_req[CURRENT_STAT_IDX][i]; } @@ -117,7 +174,7 @@ void power_mem_stat_t::print(FILE *fout) const { unsigned total_mem_writes = 0; for (unsigned i = 0; i < m_config->m_n_mem; ++i) { total_mem_reads += n_rd[CURRENT_STAT_IDX][i]; - total_mem_writes += n_wr[CURRENT_STAT_IDX][i]; + total_mem_writes += n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; } fprintf(fout, "Total memory controller accesses: %u\n", total_mem_reads + total_mem_writes); @@ -147,198 +204,165 @@ void power_core_stat_t::print(FILE *fout) { // per core statistics fprintf(fout, "Power Metrics: \n"); for (unsigned i = 0; i < m_config->num_shader(); i++) { - fprintf(fout, "core %u:\n", i); - fprintf(fout, "\tpipeline duty cycle =%f\n", - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal Deocded Instructions=%u\n", - m_num_decoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Deocded Instructions=%u\n", - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal INT Deocded Instructions=%u\n", - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal LOAD Queued Instructions=%u\n", - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal STORE Queued Instructions=%u\n", - m_num_storequeued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IALU Acesses=%u\n", - m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Acesses=%u\n", - m_num_fp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL Acesses=%u\n", - m_num_imul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL24 Acesses=%u\n", - m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL32 Acesses=%u\n", - m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IDIV Acesses=%u\n", - m_num_idiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPMUL Acesses=%u\n", - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_trans_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPDIV Acesses=%u\n", - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_sfu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Acesses=%u\n", - m_num_sp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Acesses=%u\n", - m_num_mem_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Commissions=%u\n", - m_num_sfu_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Commissions=%u\n", - m_num_sp_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Commissions=%u\n", - m_num_mem_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Reads=%u\n", - m_read_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Writes=%u\n", - m_write_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal NON REG=%u\n", - m_non_rf_operands[CURRENT_STAT_IDX][i]); + fprintf(fout,"core %u:\n",i); + fprintf(fout,"\tpipeline duty cycle =%f\n",m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal Deocded Instructions=%u\n",m_num_decoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Deocded Instructions=%u\n",m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal INT 
Deocded Instructions=%u\n",m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOAD Queued Instructions=%u\n",m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal STORE Queued Instructions=%u\n",m_num_storequeued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IALU Acesses=%f\n",m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Acesses=%f\n",m_num_fp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DP Acesses=%f\n",m_num_dp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL Acesses=%f\n",m_num_imul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL24 Acesses=%f\n",m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL32 Acesses=%f\n",m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IDIV Acesses=%f\n",m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPMUL Acesses=%f\n",m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPMUL Acesses=%f\n",m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SQRT Acesses=%f\n",m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOG Acesses=%f\n",m_num_log_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SIN Acesses=%f\n",m_num_sin_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal EXP Acesses=%f\n",m_num_exp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPDIV Acesses=%f\n",m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPDIV Acesses=%f\n",m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TENSOR Acesses=%f\n",m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal CONST Acesses=%f\n",m_num_const_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TEX Acesses=%f\n",m_num_tex_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Acesses=%f\n",m_num_sfu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Acesses=%f\n",m_num_sp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Acesses=%f\n",m_num_mem_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Commissions=%u\n",m_num_sfu_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Commissions=%u\n",m_num_sp_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Commissions=%u\n",m_num_mem_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Reads=%u\n",m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Writes=%u\n",m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal NON REG=%u\n",m_non_rf_operands[CURRENT_STAT_IDX][i]); } } void power_core_stat_t::init() { - m_pipeline_duty_cycle[CURRENT_STAT_IDX] = m_core_stats->m_pipeline_duty_cycle; - m_num_decoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_decoded_insn; - m_num_FPdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_FPdecoded_insn; - m_num_INTdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_INTdecoded_insn; - m_num_storequeued_insn[CURRENT_STAT_IDX] = - m_core_stats->m_num_storequeued_insn; - m_num_loadqueued_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_loadqueued_insn; - m_num_ialu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_ialu_acesses; - m_num_fp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fp_acesses; - m_num_imul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul_acesses; - m_num_imul24_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul24_acesses; - m_num_imul32_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul32_acesses; - m_num_fpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpmul_acesses; 
- m_num_idiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_idiv_acesses; - m_num_fpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpdiv_acesses; - m_num_sp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_acesses; - m_num_sfu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_acesses; - m_num_trans_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_trans_acesses; - m_num_mem_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_acesses; - m_num_sp_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_committed; - m_num_sfu_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_committed; - m_num_mem_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_committed; - m_read_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_read_regfile_acesses; - m_write_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_write_regfile_acesses; - m_non_rf_operands[CURRENT_STAT_IDX] = m_core_stats->m_non_rf_operands; - m_active_sp_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sp_lanes; - m_active_sfu_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sfu_lanes; - m_num_tex_inst[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_inst; + m_pipeline_duty_cycle[CURRENT_STAT_IDX]=m_core_stats->m_pipeline_duty_cycle; + m_num_decoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_decoded_insn; + m_num_FPdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_FPdecoded_insn; + m_num_INTdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_INTdecoded_insn; + m_num_storequeued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_storequeued_insn; + m_num_loadqueued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_loadqueued_insn; + m_num_ialu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_ialu_acesses; + m_num_fp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fp_acesses; + m_num_imul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul_acesses; + m_num_imul24_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul24_acesses; + m_num_imul32_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul32_acesses; + m_num_fpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpmul_acesses; + m_num_idiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_idiv_acesses; + m_num_fpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpdiv_acesses; + m_num_dp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dp_acesses; + m_num_dpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpmul_acesses; + m_num_dpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpdiv_acesses; + m_num_sp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_acesses; + m_num_sfu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_acesses; + m_num_sqrt_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sqrt_acesses; + m_num_log_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_log_acesses; + m_num_sin_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sin_acesses; + m_num_exp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_exp_acesses; + m_num_tensor_core_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tensor_core_acesses; + m_num_const_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_const_acesses; + m_num_tex_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_acesses; + m_num_mem_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_acesses; + m_num_sp_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_committed; + m_num_sfu_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_committed; + m_num_mem_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_committed; + m_read_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_read_regfile_acesses; + m_write_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_write_regfile_acesses; + 
m_non_rf_operands[CURRENT_STAT_IDX]=m_core_stats->m_non_rf_operands; + m_active_sp_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sp_lanes; + m_active_sfu_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sfu_lanes; + m_active_exu_threads[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_threads; + m_active_exu_warps[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_warps; + m_num_tex_inst[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_inst; + + m_pipeline_duty_cycle[PREV_STAT_IDX]=(float*)calloc(m_config->num_shader(),sizeof(float)); + m_num_decoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_FPdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_INTdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_storequeued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_loadqueued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_tex_inst[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + + m_num_ialu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul24_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul32_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_idiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tensor_core_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_const_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tex_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sfu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sqrt_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_log_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sin_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_exp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_mem_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_sfu_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_mem_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_read_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_write_regfile_acesses[PREV_STAT_IDX]=(unsigned 
*)calloc(m_config->num_shader(),sizeof(unsigned)); + m_non_rf_operands[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sp_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sfu_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_exu_threads[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_active_exu_warps[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + - m_pipeline_duty_cycle[PREV_STAT_IDX] = - (float *)calloc(m_config->num_shader(), sizeof(float)); - m_num_decoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_FPdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_INTdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_storequeued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_loadqueued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_tex_inst[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul24_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul32_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpmul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_idiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpdiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sp_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_read_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_write_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_non_rf_operands[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sp_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sfu_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); } void power_core_stat_t::save_stats() { for (unsigned i = 0; i < m_config->num_shader(); ++i) { - m_pipeline_duty_cycle[PREV_STAT_IDX][i] = - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; - m_num_decoded_insn[PREV_STAT_IDX][i] = - 
m_num_decoded_insn[CURRENT_STAT_IDX][i]; - m_num_FPdecoded_insn[PREV_STAT_IDX][i] = - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_INTdecoded_insn[PREV_STAT_IDX][i] = - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_storequeued_insn[PREV_STAT_IDX][i] = - m_num_storequeued_insn[CURRENT_STAT_IDX][i]; - m_num_loadqueued_insn[PREV_STAT_IDX][i] = - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; - m_num_ialu_acesses[PREV_STAT_IDX][i] = - m_num_ialu_acesses[CURRENT_STAT_IDX][i]; - m_num_fp_acesses[PREV_STAT_IDX][i] = m_num_fp_acesses[CURRENT_STAT_IDX][i]; - m_num_tex_inst[PREV_STAT_IDX][i] = m_num_tex_inst[CURRENT_STAT_IDX][i]; - m_num_imul_acesses[PREV_STAT_IDX][i] = - m_num_imul_acesses[CURRENT_STAT_IDX][i]; - m_num_imul24_acesses[PREV_STAT_IDX][i] = - m_num_imul24_acesses[CURRENT_STAT_IDX][i]; - m_num_imul32_acesses[PREV_STAT_IDX][i] = - m_num_imul32_acesses[CURRENT_STAT_IDX][i]; - m_num_fpmul_acesses[PREV_STAT_IDX][i] = - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; - m_num_idiv_acesses[PREV_STAT_IDX][i] = - m_num_idiv_acesses[CURRENT_STAT_IDX][i]; - m_num_fpdiv_acesses[PREV_STAT_IDX][i] = - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_acesses[PREV_STAT_IDX][i] = m_num_sp_acesses[CURRENT_STAT_IDX][i]; - m_num_sfu_acesses[PREV_STAT_IDX][i] = - m_num_sfu_acesses[CURRENT_STAT_IDX][i]; - m_num_trans_acesses[PREV_STAT_IDX][i] = - m_num_trans_acesses[CURRENT_STAT_IDX][i]; - m_num_mem_acesses[PREV_STAT_IDX][i] = - m_num_mem_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_committed[PREV_STAT_IDX][i] = - m_num_sp_committed[CURRENT_STAT_IDX][i]; - m_num_sfu_committed[PREV_STAT_IDX][i] = - m_num_sfu_committed[CURRENT_STAT_IDX][i]; - m_num_mem_committed[PREV_STAT_IDX][i] = - m_num_mem_committed[CURRENT_STAT_IDX][i]; - m_read_regfile_acesses[PREV_STAT_IDX][i] = - m_read_regfile_acesses[CURRENT_STAT_IDX][i]; - m_write_regfile_acesses[PREV_STAT_IDX][i] = - m_write_regfile_acesses[CURRENT_STAT_IDX][i]; - m_non_rf_operands[PREV_STAT_IDX][i] = - m_non_rf_operands[CURRENT_STAT_IDX][i]; - m_active_sp_lanes[PREV_STAT_IDX][i] = - m_active_sp_lanes[CURRENT_STAT_IDX][i]; - m_active_sfu_lanes[PREV_STAT_IDX][i] = - m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_pipeline_duty_cycle[PREV_STAT_IDX][i]=m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; + m_num_decoded_insn[PREV_STAT_IDX][i]= m_num_decoded_insn[CURRENT_STAT_IDX][i]; + m_num_FPdecoded_insn[PREV_STAT_IDX][i]=m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_INTdecoded_insn[PREV_STAT_IDX][i]=m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_storequeued_insn[PREV_STAT_IDX][i]=m_num_storequeued_insn[CURRENT_STAT_IDX][i]; + m_num_loadqueued_insn[PREV_STAT_IDX][i]=m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; + m_num_ialu_acesses[PREV_STAT_IDX][i]=m_num_ialu_acesses[CURRENT_STAT_IDX][i]; + m_num_fp_acesses[PREV_STAT_IDX][i]=m_num_fp_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_inst[PREV_STAT_IDX][i]=m_num_tex_inst[CURRENT_STAT_IDX][i]; + m_num_imul_acesses[PREV_STAT_IDX][i]=m_num_imul_acesses[CURRENT_STAT_IDX][i]; + m_num_imul24_acesses[PREV_STAT_IDX][i]=m_num_imul24_acesses[CURRENT_STAT_IDX][i]; + m_num_imul32_acesses[PREV_STAT_IDX][i]=m_num_imul32_acesses[CURRENT_STAT_IDX][i]; + m_num_fpmul_acesses[PREV_STAT_IDX][i]=m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_idiv_acesses[PREV_STAT_IDX][i]=m_num_idiv_acesses[CURRENT_STAT_IDX][i]; + m_num_fpdiv_acesses[PREV_STAT_IDX][i]=m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_acesses[PREV_STAT_IDX][i]=m_num_sp_acesses[CURRENT_STAT_IDX][i]; + 
m_num_sfu_acesses[PREV_STAT_IDX][i]=m_num_sfu_acesses[CURRENT_STAT_IDX][i]; + m_num_sqrt_acesses[PREV_STAT_IDX][i]=m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; + m_num_log_acesses[PREV_STAT_IDX][i]=m_num_log_acesses[CURRENT_STAT_IDX][i]; + m_num_sin_acesses[PREV_STAT_IDX][i]=m_num_sin_acesses[CURRENT_STAT_IDX][i]; + m_num_exp_acesses[PREV_STAT_IDX][i]=m_num_exp_acesses[CURRENT_STAT_IDX][i]; + m_num_dp_acesses[PREV_STAT_IDX][i]=m_num_dp_acesses[CURRENT_STAT_IDX][i]; + m_num_dpmul_acesses[PREV_STAT_IDX][i]=m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_dpdiv_acesses[PREV_STAT_IDX][i]=m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_tensor_core_acesses[PREV_STAT_IDX][i]=m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; + m_num_const_acesses[PREV_STAT_IDX][i]=m_num_const_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_acesses[PREV_STAT_IDX][i]=m_num_tex_acesses[CURRENT_STAT_IDX][i]; + m_num_mem_acesses[PREV_STAT_IDX][i]=m_num_mem_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_committed[PREV_STAT_IDX][i]=m_num_sp_committed[CURRENT_STAT_IDX][i]; + m_num_sfu_committed[PREV_STAT_IDX][i]=m_num_sfu_committed[CURRENT_STAT_IDX][i]; + m_num_mem_committed[PREV_STAT_IDX][i]=m_num_mem_committed[CURRENT_STAT_IDX][i]; + m_read_regfile_acesses[PREV_STAT_IDX][i]=m_read_regfile_acesses[CURRENT_STAT_IDX][i]; + m_write_regfile_acesses[PREV_STAT_IDX][i]=m_write_regfile_acesses[CURRENT_STAT_IDX][i]; + m_non_rf_operands[PREV_STAT_IDX][i]=m_non_rf_operands[CURRENT_STAT_IDX][i]; + m_active_sp_lanes[PREV_STAT_IDX][i]=m_active_sp_lanes[CURRENT_STAT_IDX][i]; + m_active_sfu_lanes[PREV_STAT_IDX][i]=m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_active_exu_threads[PREV_STAT_IDX][i]=m_active_exu_threads[CURRENT_STAT_IDX][i]; + m_active_exu_warps[PREV_STAT_IDX][i]=m_active_exu_warps[CURRENT_STAT_IDX][i]; } } @@ -356,6 +380,51 @@ power_stat_t::power_stat_t(const shader_core_config *shader_config, m_active_sms = active_sms; m_config = shader_config; m_mem_config = mem_config; + l1r_hits_kernel = 0; + l1r_misses_kernel = 0; + l1w_hits_kernel = 0; + l1w_misses_kernel = 0; + shared_accesses_kernel = 0; + cc_accesses_kernel = 0; + dram_rd_kernel = 0; + dram_wr_kernel = 0; + dram_pre_kernel = 0; + l1i_hits_kernel =0; + l1i_misses_kernel =0; + l2r_hits_kernel =0; + l2r_misses_kernel =0; + l2w_hits_kernel =0; + l2w_misses_kernel =0; + noc_tr_kernel = 0; + noc_rc_kernel = 0; + + tot_inst_execution = 0; + tot_int_inst_execution = 0; + tot_fp_inst_execution = 0; + commited_inst_execution = 0; + ialu_acc_execution = 0; + imul24_acc_execution = 0; + imul32_acc_execution = 0; + imul_acc_execution = 0; + idiv_acc_execution = 0; + dp_acc_execution = 0; + dpmul_acc_execution = 0; + dpdiv_acc_execution = 0; + fp_acc_execution = 0; + fpmul_acc_execution = 0; + fpdiv_acc_execution = 0; + sqrt_acc_execution = 0; + log_acc_execution = 0; + sin_acc_execution = 0; + exp_acc_execution = 0; + tensor_acc_execution = 0; + tex_acc_execution = 0; + tot_fpu_acc_execution = 0; + tot_sfu_acc_execution = 0; + tot_threads_acc_execution = 0; + tot_warps_acc_execution = 0; + sp_active_lanes_execution = 0; + sfu_active_lanes_execution = 0; } void power_stat_t::visualizer_print(gzFile visualizer_file) { diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index c469db3b3..e2c3ed5cc 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -51,29 +52,40 @@ struct shader_core_power_stats_pod { unsigned *m_num_INTdecoded_insn[NUM_STAT_IDX]; // number of instructions committed // by this shader core - unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; - unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; - unsigned *m_num_ialu_acesses[NUM_STAT_IDX]; - unsigned *m_num_fp_acesses[NUM_STAT_IDX]; - unsigned *m_num_tex_inst[NUM_STAT_IDX]; - unsigned *m_num_imul_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul32_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul24_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpmul_acesses[NUM_STAT_IDX]; - unsigned *m_num_idiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpdiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_acesses[NUM_STAT_IDX]; - unsigned *m_num_sfu_acesses[NUM_STAT_IDX]; - unsigned *m_num_trans_acesses[NUM_STAT_IDX]; - unsigned *m_num_mem_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_committed[NUM_STAT_IDX]; - unsigned *m_num_sfu_committed[NUM_STAT_IDX]; - unsigned *m_num_mem_committed[NUM_STAT_IDX]; - unsigned *m_active_sp_lanes[NUM_STAT_IDX]; - unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; - unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_non_rf_operands[NUM_STAT_IDX]; + unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; + unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; + unsigned *m_num_tex_inst[NUM_STAT_IDX]; + double *m_num_ialu_acesses[NUM_STAT_IDX]; + double *m_num_fp_acesses[NUM_STAT_IDX]; + double *m_num_imul_acesses[NUM_STAT_IDX]; + double *m_num_imul32_acesses[NUM_STAT_IDX]; + double *m_num_imul24_acesses[NUM_STAT_IDX]; + double *m_num_fpmul_acesses[NUM_STAT_IDX]; + double *m_num_idiv_acesses[NUM_STAT_IDX]; + double *m_num_fpdiv_acesses[NUM_STAT_IDX]; + double *m_num_dp_acesses[NUM_STAT_IDX]; + double *m_num_dpmul_acesses[NUM_STAT_IDX]; + double *m_num_dpdiv_acesses[NUM_STAT_IDX]; + double *m_num_sp_acesses[NUM_STAT_IDX]; + 
double *m_num_sfu_acesses[NUM_STAT_IDX]; + double *m_num_sqrt_acesses[NUM_STAT_IDX]; + double *m_num_log_acesses[NUM_STAT_IDX]; + double *m_num_sin_acesses[NUM_STAT_IDX]; + double *m_num_exp_acesses[NUM_STAT_IDX]; + double *m_num_tensor_core_acesses[NUM_STAT_IDX]; + double *m_num_const_acesses[NUM_STAT_IDX]; + double *m_num_tex_acesses[NUM_STAT_IDX]; + double *m_num_mem_acesses[NUM_STAT_IDX]; + unsigned *m_num_sp_committed[NUM_STAT_IDX]; + unsigned *m_num_sfu_committed[NUM_STAT_IDX]; + unsigned *m_num_mem_committed[NUM_STAT_IDX]; + unsigned *m_active_sp_lanes[NUM_STAT_IDX]; + unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; + double *m_active_exu_threads[NUM_STAT_IDX]; + double *m_active_exu_warps[NUM_STAT_IDX]; + unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_non_rf_operands[NUM_STAT_IDX]; }; class power_core_stat_t : public shader_core_power_stats_pod { @@ -84,6 +96,7 @@ class power_core_stat_t : public shader_core_power_stats_pod { void print(FILE *fout); void init(); void save_stats(); + private: shader_core_stats *m_core_stats; @@ -96,8 +109,7 @@ struct mem_power_stats_pod { class cache_stats core_cache_stats[NUM_STAT_IDX]; // Total core stats class cache_stats l2_cache_stats[NUM_STAT_IDX]; // Total L2 partition stats - unsigned *shmem_read_access[NUM_STAT_IDX]; // Shared memory access - + unsigned *shmem_access[NUM_STAT_IDX]; // Shared memory access // Low level DRAM stats unsigned *n_cmd[NUM_STAT_IDX]; unsigned *n_activity[NUM_STAT_IDX]; @@ -106,6 +118,7 @@ struct mem_power_stats_pod { unsigned *n_pre[NUM_STAT_IDX]; unsigned *n_rd[NUM_STAT_IDX]; unsigned *n_wr[NUM_STAT_IDX]; + unsigned *n_wr_WB[NUM_STAT_IDX]; unsigned *n_req[NUM_STAT_IDX]; // Interconnect stats @@ -144,34 +157,88 @@ class power_stat_t { *m_average_pipeline_duty_cycle = 0; *m_active_sms = 0; } - - unsigned get_total_inst() { - unsigned total_inst = 0; + void clear(); + unsigned l1i_misses_kernel; + unsigned l1i_hits_kernel; + unsigned long long l1r_hits_kernel; + unsigned long long l1r_misses_kernel; + unsigned long long l1w_hits_kernel; + unsigned long long l1w_misses_kernel; + unsigned long long shared_accesses_kernel; + unsigned long long cc_accesses_kernel; + unsigned long long dram_rd_kernel; + unsigned long long dram_wr_kernel; + unsigned long long dram_pre_kernel; + unsigned long long l2r_hits_kernel; + unsigned long long l2r_misses_kernel; + unsigned long long l2w_hits_kernel; + unsigned long long l2w_misses_kernel; + unsigned long long noc_tr_kernel; + unsigned long long noc_rc_kernel; + unsigned long long tot_inst_execution; + unsigned long long tot_int_inst_execution; + unsigned long long tot_fp_inst_execution; + unsigned long long commited_inst_execution; + unsigned long long ialu_acc_execution; + unsigned long long imul24_acc_execution; + unsigned long long imul32_acc_execution; + unsigned long long imul_acc_execution; + unsigned long long idiv_acc_execution; + unsigned long long dp_acc_execution; + unsigned long long dpmul_acc_execution; + unsigned long long dpdiv_acc_execution; + unsigned long long fp_acc_execution; + unsigned long long fpmul_acc_execution; + unsigned long long fpdiv_acc_execution; + unsigned long long sqrt_acc_execution; + unsigned long long log_acc_execution; + unsigned long long sin_acc_execution; + unsigned long long exp_acc_execution; + unsigned long long tensor_acc_execution; + unsigned long long tex_acc_execution; + unsigned long long tot_fpu_acc_execution; + unsigned long long tot_sfu_acc_execution; + unsigned long 
long tot_threads_acc_execution; + unsigned long long tot_warps_acc_execution; + unsigned long long sp_active_lanes_execution; + unsigned long long sfu_active_lanes_execution; + double get_total_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_int_inst() { - unsigned total_inst = 0; + double get_total_int_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_fp_inst() { - unsigned total_inst = 0; + double get_total_fp_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_load_inst() { - unsigned total_inst = 0; + double get_total_load_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_loadqueued_insn[CURRENT_STAT_IDX][i]) - @@ -179,8 +246,8 @@ class power_stat_t { } return total_inst; } - unsigned get_total_store_inst() { - unsigned total_inst = 0; + double get_total_store_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_storequeued_insn[CURRENT_STAT_IDX][i]) - @@ -188,34 +255,39 @@ class power_stat_t { } return total_inst; } - unsigned get_sp_committed_inst() { - unsigned total_inst = 0; + double get_sp_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sfu_committed_inst() { - unsigned total_inst = 0; + double get_sfu_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_mem_committed_inst() { - unsigned total_inst = 0; + double get_mem_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_committed_inst() { - unsigned total_inst = 0; + double get_committed_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - + 
if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + @@ -224,19 +296,27 @@ class power_stat_t { } return total_inst; } - unsigned get_regfile_reads() { - unsigned total_inst = 0; + double get_regfile_reads(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_regfile_writes() { - unsigned total_inst = 0; + double get_regfile_writes(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); } @@ -253,17 +333,20 @@ class power_stat_t { return total_inst; } - unsigned get_non_regfile_operands() { - unsigned total_inst = 0; + double get_non_regfile_operands(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sp_accessess() { - unsigned total_inst = 0; + double get_sp_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_acesses[PREV_STAT_IDX][i]); @@ -271,25 +354,58 @@ class power_stat_t { return total_inst; } - unsigned get_sfu_accessess() { - unsigned total_inst = 0; + double get_sfu_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_trans_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); - } - return total_inst; + + double get_sqrt_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_log_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) +
total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_sin_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_exp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; } - unsigned get_mem_accessess() { - unsigned total_inst = 0; + double get_mem_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_acesses[PREV_STAT_IDX][i]); @@ -297,66 +413,164 @@ class power_stat_t { return total_inst; } - unsigned get_intdiv_accessess() { - unsigned total_inst = 0; + double get_intdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpdiv_accessess() { - unsigned total_inst = 0; + double get_fpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul32_accessess() { - unsigned total_inst = 0; + double get_intmul32_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul24_accessess() { - unsigned total_inst = 0; + double get_intmul24_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i <
m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + double get_intmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_fpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); + double get_fp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - float get_sp_active_lanes() { - unsigned total_inst = 0; + double get_dp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpdiv_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_tensor_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_const_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst +=
pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; + else + total_inst += (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); + } + return (total_inst); + } + + double get_tex_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_sp_active_lanes() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sp_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sp_lanes[PREV_STAT_IDX][i]); @@ -365,7 +579,7 @@ class power_stat_t { } float get_sfu_active_lanes() { - unsigned total_inst = 0; + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sfu_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sfu_lanes[PREV_STAT_IDX][i]); @@ -375,49 +589,141 @@ class power_stat_t { m_config->gpgpu_num_sfu_units; } - unsigned get_tot_fpu_accessess() { - unsigned total_inst = 0; + + float get_active_threads(bool aggregate_stat) { + unsigned total_threads = 0; + unsigned total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + if(total_warps != 0) + return (float)((float)total_threads / (float)total_warps); + else + return 0; + } + + unsigned long long get_tot_threads_kernel(bool aggregate_stat) { + unsigned total_threads = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + } + } + + return total_threads; + } + unsigned long long get_tot_warps_kernel(bool aggregate_stat) { + unsigned long long total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + return total_warps; + } + + + double get_tot_fpu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + -
(pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); } - total_inst += - get_total_load_inst() + get_total_store_inst() + get_tex_inst(); + //total_inst += get_total_load_inst()+get_total_store_inst()+get_tex_inst(); return total_inst; } - unsigned get_tot_sfu_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + + + double get_tot_sfu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + +
(pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } return total_inst; } - unsigned get_ialu_accessess() { - unsigned total_inst = 0; + double get_ialu_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_tex_inst() { - unsigned total_inst = 0; + double get_tex_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_tex_inst[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_tex_inst[PREV_STAT_IDX][i]); @@ -425,7 +731,7 @@ class power_stat_t { return total_inst; } - unsigned get_constant_c_accesses() { + double get_constant_c_accesses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -440,7 +746,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_misses() { + double get_constant_c_misses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -455,10 +761,10 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_hits() { + double get_constant_c_hits() { return (get_constant_c_accesses() - get_constant_c_misses()); } - unsigned get_texture_c_accesses() { + double get_texture_c_accesses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -473,7 +779,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_misses() { + double get_texture_c_misses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -488,205 +794,268 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_hits() { + double get_texture_c_hits() { return (get_texture_c_accesses() - get_texture_c_misses()); } - unsigned get_inst_c_accesses() { + double get_inst_c_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, 
num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_misses() { + double get_inst_c_misses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_hits() { - return (get_inst_c_accesses() - get_inst_c_misses()); + double get_inst_c_hits(bool aggregate_stat) { + return (get_inst_c_accesses(aggregate_stat) - get_inst_c_misses(aggregate_stat)); } - unsigned get_l1d_read_accesses() { + double get_l1d_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_read_misses(bool aggregate_stat) { + return (get_l1d_read_accesses(aggregate_stat) - get_l1d_read_hits(aggregate_stat)); } - unsigned get_l1d_read_misses() { + double get_l1d_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l1d_read_hits() { - return (get_l1d_read_accesses() - get_l1d_read_misses()); - } - unsigned get_l1d_write_accesses() { + double get_l1d_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {HIT, MISS, 
HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_write_misses(bool aggregate_stat) { + return (get_l1d_write_accesses(aggregate_stat) - get_l1d_write_hits(aggregate_stat)); } - unsigned get_l1d_write_misses() { + double get_l1d_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l1d_write_hits() { - return (get_l1d_write_accesses() - get_l1d_write_misses()); - } - unsigned get_cache_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + - get_l1d_write_misses() + get_texture_c_misses(); + double get_cache_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + + get_l1d_write_misses(0) + get_texture_c_misses(); } - unsigned get_cache_read_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + + double get_cache_read_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + get_texture_c_misses(); } - unsigned get_cache_write_misses() { return get_l1d_write_misses(); } + double get_cache_write_misses() { return get_l1d_write_misses(0); } - unsigned get_shmem_read_access() { + double get_shmem_access(bool aggregate_stat) { unsigned total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_mem_stat->shmem_read_access[CURRENT_STAT_IDX][i]) - - (pwr_mem_stat->shmem_read_access[PREV_STAT_IDX][i]); + if(aggregate_stat) + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]) - + (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_l2_read_accesses() { + unsigned long long get_l2_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = 
sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_misses() { - enum mem_access_type access_type[] = { + unsigned long long get_l2_read_misses(bool aggregate_stat) { + return (get_l2_read_accesses(aggregate_stat) - get_l2_read_hits(aggregate_stat)); + } + + unsigned long long get_l2_read_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_hits() { - return (get_l2_read_accesses() - get_l2_read_misses()); - } - - unsigned get_l2_write_accesses() { + unsigned long long get_l2_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_misses() { - enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, + unsigned long long get_l2_write_misses(bool aggregate_stat) { + return (get_l2_write_accesses(aggregate_stat) - get_l2_write_hits(aggregate_stat)); + } + unsigned long long get_l2_write_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / 
sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_hits() { - return (get_l2_write_accesses() - get_l2_write_misses()); - } - unsigned get_dram_cmd() { + double get_dram_cmd() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i] - @@ -694,7 +1063,7 @@ class power_stat_t { } return total; } - unsigned get_dram_activity() { + double get_dram_activity() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i] - @@ -702,7 +1071,7 @@ class power_stat_t { } return total; } - unsigned get_dram_nop() { + double get_dram_nop() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i] - @@ -710,7 +1079,7 @@ class power_stat_t { } return total; } - unsigned get_dram_act() { + double get_dram_act() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_act[CURRENT_STAT_IDX][i] - @@ -718,31 +1087,49 @@ class power_stat_t { } return total; } - unsigned get_dram_pre() { + double get_dram_pre(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_rd() { + double get_dram_rd(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_wr() { + double get_dram_wr(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr[PREV_STAT_IDX][i]); + if(aggregate_stat){ + total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + + pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + + (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_req() { + double get_dram_req() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_req[CURRENT_STAT_IDX][i] - @@ -751,20 +1138,31 @@ class power_stat_t { return total; } - long get_icnt_simt_to_mem() { + unsigned long long get_icnt_simt_to_mem(bool aggregate_stat) { long total = 0; - for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += 
(pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - + for (unsigned i = 0; i < m_config->n_simt_clusters; ++i){ + if(aggregate_stat){ + total += pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + } } return total; } - long get_icnt_mem_to_simt() { + unsigned long long get_icnt_mem_to_simt(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]; + } + + else{ + total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + } } return total; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index bcfda1867..c0161dd31 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. Yuan, Andrew Turner, Inderpreet Singh -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -485,6 +486,10 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_sid = shader_id; m_tpc = tpc_id; + if(get_gpu()->get_config().g_power_simulation_enabled){ + scaling_coeffs = get_gpu()->get_scaling_coeffs(); + } + m_last_inst_gpu_sim_cycle = 0; m_last_inst_gpu_tot_sim_cycle = 0; @@ -888,7 +893,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); if (pI1) { m_stats->m_num_decoded_insn[m_sid]++; - if (pI1->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI1->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -899,7 +904,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->ibuffer_fill(1, pI2); m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); m_stats->m_num_decoded_insn[m_sid]++; - if (pI2->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI2->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -982,8 +987,10 @@ void shader_core_ctx::fetch() { m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list events; enum cache_request_status status; - if (m_config->perfect_inst_const_cache) + if (m_config->perfect_inst_const_cache){ status = HIT; + shader_cache_access_log(m_sid, INSTRUCTION, 0); + } else status = m_L1I->access( (new_addr_type)ppc, mf, @@ -2275,7 +2282,7 @@ void sp_unit::active_lanes_in_pipeline() { void dp_unit::active_lanes_in_pipeline() { unsigned active_count = pipelined_simd_unit::get_active_lanes_in_pipeline(); assert(active_count <= m_core->get_config()->warp_size); - m_core->incspactivelanes_stat(active_count); + //m_core->incspactivelanes_stat(active_count); m_core->incfuactivelanes_stat(active_count); m_core->incfumemactivelanes_stat(active_count); } @@ -3079,52 +3086,69 @@ void warp_inst_t::print(FILE *fout) const { m_config->gpgpu_ctx->func_sim->ptx_print_insn(pc, fout); fprintf(fout, "\n"); } -void shader_core_ctx::incexecstat(warp_inst_t *&inst) { - if (inst->mem_op == TEX) inctex_stat(inst->active_count(), 1); - - // Latency numbers for next operations are used to scale the power values - // for special operations, according observations from microbenchmarking - // TODO: put these numbers in the xml configuration - - switch (inst->sp_op) { +void shader_core_ctx::incexecstat(warp_inst_t *&inst) +{ + // Latency numbers for next operations are used to scale the power values + // for special operations, according observations from microbenchmarking + // TODO: put these numbers in the xml configuration + if(get_gpu()->get_config().g_power_simulation_enabled){ + switch(inst->sp_op){ case INT__OP: - incialu_stat(inst->active_count(), 32); + incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); break; case INT_MUL_OP: - incimul_stat(inst->active_count(), 7.2); + incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); break; case INT_MUL24_OP: - incimul24_stat(inst->active_count(), 4.2); + incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); break; case INT_MUL32_OP: - incimul32_stat(inst->active_count(), 4); + 
incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); break; case INT_DIV_OP: - incidiv_stat(inst->active_count(), 40); + incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); break; case FP__OP: - incfpalu_stat(inst->active_count(), 1); + incfpalu_stat(inst->active_count(),scaling_coeffs->fp_coeff); break; case FP_MUL_OP: - incfpmul_stat(inst->active_count(), 1.8); + incfpmul_stat(inst->active_count(), scaling_coeffs->fp_mul_coeff); break; case FP_DIV_OP: - incfpdiv_stat(inst->active_count(), 48); + incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); + break; + case DP___OP: + incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); + break; + case DP_MUL_OP: + incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); + break; + case DP_DIV_OP: + incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); break; case FP_SQRT_OP: - inctrans_stat(inst->active_count(), 25); + incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); break; case FP_LG_OP: - inctrans_stat(inst->active_count(), 35); + inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); break; case FP_SIN_OP: - inctrans_stat(inst->active_count(), 12); + incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); break; case FP_EXP_OP: - inctrans_stat(inst->active_count(), 35); + incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); + break; + case TENSOR__OP: + inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); + break; + case TEX__OP: + inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); break; default: break; + } + if(inst->const_cache_operand) //warp has const address space load as one operand + inc_const_accesses(1); } } void shader_core_ctx::print_stage(unsigned int stage, FILE *fout) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index f2fac1209..65d56251c 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -1709,18 +1710,26 @@ struct shader_core_stats_pod { unsigned *m_num_INTdecoded_insn; unsigned *m_num_storequeued_insn; unsigned *m_num_loadqueued_insn; - unsigned *m_num_ialu_acesses; - unsigned *m_num_fp_acesses; - unsigned *m_num_imul_acesses; unsigned *m_num_tex_inst; - unsigned *m_num_fpmul_acesses; - unsigned *m_num_idiv_acesses; - unsigned *m_num_fpdiv_acesses; - unsigned *m_num_sp_acesses; - unsigned *m_num_sfu_acesses; - unsigned *m_num_tensor_core_acesses; - unsigned *m_num_trans_acesses; - unsigned *m_num_mem_acesses; + double *m_num_ialu_acesses; + double *m_num_fp_acesses; + double *m_num_imul_acesses; + double *m_num_fpmul_acesses; + double *m_num_idiv_acesses; + double *m_num_fpdiv_acesses; + double *m_num_sp_acesses; + double *m_num_sfu_acesses; + double *m_num_tensor_core_acesses; + double *m_num_tex_acesses; + double *m_num_const_acesses; + double *m_num_dp_acesses; + double *m_num_dpmul_acesses; + double *m_num_dpdiv_acesses; + double *m_num_sqrt_acesses; + double *m_num_log_acesses; + double *m_num_sin_acesses; + double *m_num_exp_acesses; + double *m_num_mem_acesses; unsigned *m_num_sp_committed; unsigned *m_num_tlb_hits; unsigned *m_num_tlb_accesses; @@ -1730,13 +1739,15 @@ struct shader_core_stats_pod { unsigned *m_read_regfile_acesses; unsigned *m_write_regfile_acesses; unsigned *m_non_rf_operands; - unsigned *m_num_imul24_acesses; - unsigned *m_num_imul32_acesses; + double *m_num_imul24_acesses; + double *m_num_imul32_acesses; unsigned *m_active_sp_lanes; unsigned *m_active_sfu_lanes; unsigned *m_active_tensor_core_lanes; unsigned *m_active_fu_lanes; unsigned *m_active_fu_mem_lanes; + double *m_active_exu_threads; //For power model + double *m_active_exu_warps; //For power model unsigned *m_n_diverge; // number of divergence occurring in this shader unsigned gpgpu_n_load_insn; unsigned gpgpu_n_store_insn; @@ -1807,38 +1818,56 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tex_inst = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_INTdecoded_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_ialu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tex_inst = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul24_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul32_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpmul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double 
*)calloc(config->num_shader(), sizeof(double)); m_num_idiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpdiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpmul_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_sp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_sfu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tensor_core_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_const_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tex_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sqrt_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_log_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_sin_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_exp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_mem_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_sp_committed = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tlb_hits = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tlb_hits = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tlb_accesses = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_sp_lanes = @@ -1849,6 +1878,10 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_fu_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_active_exu_threads = + (double *)calloc(config->num_shader(), sizeof(double)); + m_active_exu_warps = + (double *)calloc(config->num_shader(), sizeof(double)); m_active_fu_mem_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_sfu_committed = @@ -1863,7 +1896,8 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_non_rf_operands = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_n_diverge = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_n_diverge = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); shader_cycle_distro = (unsigned *)calloc(config->warp_size + 3, sizeof(unsigned)); last_shader_cycle_distro = @@ -1892,6 +1926,48 @@ class shader_core_stats : public shader_core_stats_pod { delete m_incoming_traffic_stats; free(m_num_sim_insn); free(m_num_sim_winsn); + free(m_num_FPdecoded_insn); + free(m_num_INTdecoded_insn); + free(m_num_storequeued_insn); + free(m_num_loadqueued_insn); + free(m_num_ialu_acesses); + free(m_num_fp_acesses); + free(m_num_imul_acesses); + free(m_num_tex_inst); + free(m_num_fpmul_acesses); + free(m_num_idiv_acesses); + free(m_num_fpdiv_acesses); + free(m_num_sp_acesses); + 
free(m_num_sfu_acesses); + free(m_num_tensor_core_acesses); + free(m_num_tex_acesses); + free(m_num_const_acesses); + free(m_num_dp_acesses); + free(m_num_dpmul_acesses); + free(m_num_dpdiv_acesses); + free(m_num_sqrt_acesses); + free(m_num_log_acesses); + free(m_num_sin_acesses); + free(m_num_exp_acesses); + free(m_num_mem_acesses); + free(m_num_sp_committed); + free(m_num_tlb_hits); + free(m_num_tlb_accesses); + free(m_num_sfu_committed); + free(m_num_tensor_core_committed); + free(m_num_mem_committed); + free(m_read_regfile_acesses); + free(m_write_regfile_acesses); + free(m_non_rf_operands); + free(m_num_imul24_acesses); + free(m_num_imul32_acesses); + free(m_active_sp_lanes); + free(m_active_sfu_lanes); + free(m_active_tensor_core_lanes); + free(m_active_fu_lanes); + free(m_active_exu_threads); + free(m_active_exu_warps); + free(m_active_fu_mem_lanes); free(m_n_diverge); free(shader_cycle_distro); free(last_shader_cycle_distro); @@ -1996,7 +2072,7 @@ class shader_core_ctx : public core_t { printf("GPGPU-Sim uArch: Shader %d bind to kernel %u \'%s\'\n", m_sid, m_kernel->get_uid(), m_kernel->name().c_str()); } - + PowerscalingCoefficients *scaling_coeffs; // accessors bool fetch_unit_response_buffer_full() const; bool ldst_unit_response_buffer_full() const; @@ -2054,119 +2130,206 @@ class shader_core_ctx : public core_t { void incload_stat() { m_stats->m_num_loadqueued_insn[m_sid]++; } void incstore_stat() { m_stats->m_num_storequeued_insn[m_sid]++; } - void incialu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency; + void incialu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency; } - } - void inctex_stat(unsigned active_count, double latency) { - m_stats->m_num_tex_inst[m_sid] = - m_stats->m_num_tex_inst[m_sid] + active_count * latency; - } - void incimul_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency; } - } - void incimul24_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, 
latency); - } else { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul24_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency; } - } - void incimul32_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul32_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency; } - // printf("Int_Mul -- Active_count: %d\n",active_count); - } - void incidiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incidiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency; } - } - void incfpalu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + 
m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incsqrt_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inclog_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incexp_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, 
latency); + }else{ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpmul_stat(unsigned active_count, double latency) { - // printf("FP MUL stat increament\n"); - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency; + + void incsin_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpdiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency; + + + void inctensor_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void inctrans_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency; + + void inctex_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inc_const_accesses(unsigned active_count) { + m_stats->m_num_const_acesses[m_sid]=m_stats->m_num_const_acesses[m_sid]+active_count; } void incsfu_stat(unsigned active_count, double latency) { m_stats->m_num_sfu_acesses[m_sid] = - m_stats->m_num_sfu_acesses[m_sid] + active_count * latency; + m_stats->m_num_sfu_acesses[m_sid] + (double)active_count*latency; } void incsp_stat(unsigned active_count, double latency) { m_stats->m_num_sp_acesses[m_sid] = - m_stats->m_num_sp_acesses[m_sid] + active_count * latency; + m_stats->m_num_sp_acesses[m_sid] + (double)active_count*latency; } void incmem_stat(unsigned active_count, double latency) { if 
(m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency + + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency; + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency; } } void incexecstat(warp_inst_t *&inst); diff --git a/src/gpgpu-sim/stat-tool.cc b/src/gpgpu-sim/stat-tool.cc index 6fafaa6af..0513d17ed 100644 --- a/src/gpgpu-sim/stat-tool.cc +++ b/src/gpgpu-sim/stat-tool.cc @@ -369,8 +369,6 @@ void shader_mem_lat_print(FILE *fout) { static int s_cache_access_logger_n_types = 0; static std::vector s_cache_access_logger; -enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; - int get_shader_normal_cache_id() { return NORMALS; } int get_shader_texture_cache_id() { return TEXTURE; } int get_shader_constant_cache_id() { return CONSTANT; } diff --git a/src/gpgpu-sim/stat-tool.h b/src/gpgpu-sim/stat-tool.h index 3a291be3a..fdf875600 100644 --- a/src/gpgpu-sim/stat-tool.h +++ b/src/gpgpu-sim/stat-tool.h @@ -268,6 +268,8 @@ class linear_histogram_logger : public snap_shot_trigger, static int s_ids; }; +enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; + void try_snap_shot(unsigned long long current_cycle); void set_spill_interval(unsigned long long interval); void spill_log_to_file(FILE *fout, int final, unsigned long long current_cycle); diff --git a/src/gpuwattch/gpgpu_sim_wrapper.cc b/src/gpuwattch/gpgpu_sim_wrapper.cc deleted file mode 100644 index f2989f630..000000000 --- a/src/gpuwattch/gpgpu_sim_wrapper.cc +++ /dev/null @@ -1,863 +0,0 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
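For reference, the inc*_stat helpers introduced above all follow the same latency-weighted accounting; a minimal standalone sketch of that pattern is given below. The struct and function names are invented for illustration only — the real code lives in shader_core_ctx, indexes the per-shader arrays by m_sid, and computes the inactive-lane term via inactive_lanes_accesses_sfu / inactive_lanes_accesses_nonsfu.

// Sketch only: mirrors the accumulation done by incialu_stat, incimul_stat,
// incfpdiv_stat, etc. above. Names are illustrative, not the simulator's own.
#include <vector>

struct ExuStats {
  std::vector<double> fu_accesses;         // e.g. m_num_ialu_acesses, one entry per shader
  std::vector<double> active_exu_threads;  // threads that actually executed
  std::vector<double> active_exu_warps;    // warp-level issue count
};

void accumulate_fu_access(ExuStats &s, unsigned sid, unsigned active_count,
                          double latency, bool clock_gated_lanes,
                          double inactive_lane_accesses) {
  // Each access is weighted by active threads times unit latency; counters
  // are doubles so fractional scaling does not truncate.
  double work = (double)active_count * latency;
  if (!clock_gated_lanes) {
    // Without clock gating, idle lanes are still charged, so a scaled
    // inactive-lane term is added on top of the useful work.
    work += inactive_lane_accesses;
  }
  s.fu_accesses[sid] += work;
  s.active_exu_threads[sid] += active_count;
  s.active_exu_warps[sid] += 1.0;
}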
- -#include "gpgpu_sim_wrapper.h" -#include -#define SP_BASE_POWER 0 -#define SFU_BASE_POWER 0 - -static const char* pwr_cmp_label[] = { - "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", - "RFP,", "SPP,", "SFUP,", "FPUP,", "SCHEDP,", "L2CP,", - "MCP,", "NOCP,", "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONST_DYNAMICP"}; - -enum pwr_cmp_t { - IBP = 0, - ICP, - DCP, - TCP, - CCP, - SHRDP, - RFP, - SPP, - SFUP, - FPUP, - SCHEDP, - L2CP, - MCP, - NOCP, - DRAMP, - PIPEP, - IDLE_COREP, - CONST_DYNAMICP, - NUM_COMPONENTS_MODELLED -}; - -gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, - char* xmlfile) { - kernel_sample_count = 0; - total_sample_count = 0; - - kernel_tot_power = 0; - - num_pwr_cmps = NUM_COMPONENTS_MODELLED; - num_perf_counters = NUM_PERFORMANCE_COUNTERS; - - // Initialize per-component counter/power vectors - avg_max_min_counters init; - kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); - kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); - - kernel_power = init; // Per-kernel powers - gpu_tot_power = init; // Global powers - - sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); - - sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0); - initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - - const_dynamic_power = 0; - proc_power = 0; - - g_power_filename = NULL; - g_power_trace_filename = NULL; - g_metric_trace_filename = NULL; - g_steady_state_tracking_filename = NULL; - xml_filename = xmlfile; - g_power_simulation_enabled = power_simulation_enabled; - g_power_trace_enabled = false; - g_steady_power_levels_enabled = false; - g_power_trace_zlevel = 0; - g_power_per_cycle_dump = false; - gpu_steady_power_deviation = 0; - gpu_steady_min_period = 0; - - gpu_stat_sample_freq = 0; - p = new ParseXML(); - if (g_power_simulation_enabled) { - p->parse(xml_filename); - } - proc = new Processor(p); - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - has_written_avg = false; - init_inst_val = false; -} - -gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} - -bool gpgpu_sim_wrapper::sanity_check(double a, double b) { - if (b == 0) - return (abs(a - b) < 0.00001); - else - return (abs(a - b) / abs(b) < 0.00001); - - return false; -} -void gpgpu_sim_wrapper::init_mcpat( - char* xmlfile, char* powerfilename, char* power_trace_filename, - char* metric_trace_filename, char* steady_state_filename, - bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, - bool power_per_cycle_dump, double steady_power_deviation, - double steady_min_period, int zlevel, double init_val, - int stat_sample_freq) { - // Write File Headers for (-metrics trace, -power trace) - - reset_counters(); - static bool mcpat_init = true; - - // initialize file name if it is not set - time_t curr_time; - time(&curr_time); - char* date = ctime(&curr_time); - char* s = date; - while (*s) { - if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; - if (*s == '\n' || *s == '\r') *s = 0; - s++; - } - - if (mcpat_init) { - g_power_filename = powerfilename; - g_power_trace_filename = power_trace_filename; - g_metric_trace_filename = metric_trace_filename; - g_steady_state_tracking_filename = steady_state_filename; - xml_filename = xmlfile; - g_power_simulation_enabled = power_sim_enabled; - g_power_trace_enabled = trace_enabled; - g_steady_power_levels_enabled = steady_state_enabled; - g_power_trace_zlevel = zlevel; - g_power_per_cycle_dump = power_per_cycle_dump; - gpu_steady_power_deviation = 
steady_power_deviation; - gpu_steady_min_period = steady_min_period; - - gpu_stat_sample_freq = stat_sample_freq; - - // p->sys.total_cycles=gpu_stat_sample_freq*4; - p->sys.total_cycles = gpu_stat_sample_freq; - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - - if (g_power_trace_enabled) { - power_trace_file = gzopen(g_power_trace_filename, "w"); - metric_trace_file = gzopen(g_metric_trace_filename, "w"); - if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - - gzprintf(power_trace_file, "power,"); - for (unsigned i = 0; i < num_pwr_cmps; i++) { - gzprintf(power_trace_file, pwr_cmp_label[i]); - } - gzprintf(power_trace_file, "\n"); - - gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(metric_trace_file, perf_count_label[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - if (g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); - if ((steady_state_tacking_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, - Z_DEFAULT_STRATEGY); - gzprintf(steady_state_tacking_file, "start,end,power,IPC,"); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(steady_state_tacking_file, perf_count_label[i]); - } - gzprintf(steady_state_tacking_file, "\n"); - - gzclose(steady_state_tacking_file); - } - - mcpat_init = false; - has_written_avg = false; - powerfile.open(g_power_filename); - int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - assert(flg == 0); - } - sample_val = 0; - init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; -} - -void gpgpu_sim_wrapper::reset_counters() { - avg_max_min_counters init; - for (unsigned i = 0; i < num_perf_counters; ++i) { - sample_perf_counters[i] = 0; - kernel_cmp_perf_counters[i] = init; - } - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - sample_cmp_pwr[i] = 0; - kernel_cmp_pwr[i] = init; - } - - // Reset per-kernel counters - kernel_sample_count = 0; - kernel_tot_power = 0; - kernel_power = init; - - return; -} - -void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, - double busy_cycles, double tot_inst, - double int_inst, double fp_inst, - double load_inst, double store_inst, - double committed_inst) { - p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; - p->sys.core[0].total_cycles = tot_cycles; - p->sys.core[0].busy_cycles = busy_cycles; - p->sys.core[0].total_instructions = - tot_inst * p->sys.scaling_coefficients[TOT_INST]; - p->sys.core[0].int_instructions = - int_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].fp_instructions = - fp_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].load_instructions = load_inst; - p->sys.core[0].store_instructions = store_inst; - p->sys.core[0].committed_instructions = committed_inst; - sample_perf_counters[FP_INT] = int_inst + fp_inst; - sample_perf_counters[TOT_INST] = tot_inst; -} - -void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, - double ops) { - p->sys.core[0].int_regfile_reads = - reads * p->sys.scaling_coefficients[REG_RD]; - p->sys.core[0].int_regfile_writes = - writes * 
p->sys.scaling_coefficients[REG_WR]; - p->sys.core[0].non_rf_operands = - ops * p->sys.scaling_coefficients[NON_REG_OPs]; - sample_perf_counters[REG_RD] = reads; - sample_perf_counters[REG_WR] = writes; - sample_perf_counters[NON_REG_OPs] = ops; -} - -void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { - p->sys.core[0].icache.read_accesses = - hits * p->sys.scaling_coefficients[IC_H] + - misses * p->sys.scaling_coefficients[IC_M]; - p->sys.core[0].icache.read_misses = - misses * p->sys.scaling_coefficients[IC_M]; - sample_perf_counters[IC_H] = hits; - sample_perf_counters[IC_M] = misses; -} - -void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { - p->sys.core[0].ccache.read_accesses = - hits * p->sys.scaling_coefficients[CC_H] + - misses * p->sys.scaling_coefficients[CC_M]; - p->sys.core[0].ccache.read_misses = - misses * p->sys.scaling_coefficients[CC_M]; - sample_perf_counters[CC_H] = hits; - sample_perf_counters[CC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { - p->sys.core[0].tcache.read_accesses = - hits * p->sys.scaling_coefficients[TC_H] + - misses * p->sys.scaling_coefficients[TC_M]; - p->sys.core[0].tcache.read_misses = - misses * p->sys.scaling_coefficients[TC_M]; - sample_perf_counters[TC_H] = hits; - sample_perf_counters[TC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { - p->sys.core[0].sharedmemory.read_accesses = - accesses * p->sys.scaling_coefficients[SHRD_ACC]; - sample_perf_counters[SHRD_ACC] = accesses; -} - -void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.core[0].dcache.read_accesses = - read_hits * p->sys.scaling_coefficients[DC_RH] + - read_misses * p->sys.scaling_coefficients[DC_RM]; - p->sys.core[0].dcache.read_misses = - read_misses * p->sys.scaling_coefficients[DC_RM]; - p->sys.core[0].dcache.write_accesses = - write_hits * p->sys.scaling_coefficients[DC_WH] + - write_misses * p->sys.scaling_coefficients[DC_WM]; - p->sys.core[0].dcache.write_misses = - write_misses * p->sys.scaling_coefficients[DC_WM]; - sample_perf_counters[DC_RH] = read_hits; - sample_perf_counters[DC_RM] = read_misses; - sample_perf_counters[DC_WH] = write_hits; - sample_perf_counters[DC_WM] = write_misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM] + - write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; - p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; - 
p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; - sample_perf_counters[L2_RH] = read_hits; - sample_perf_counters[L2_RM] = read_misses; - sample_perf_counters[L2_WH] = write_hits; - sample_perf_counters[L2_WM] = write_misses; -} - -void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { - p->sys.num_idle_cores = num_idle_core; - sample_perf_counters[IDLE_CORE_N] = num_idle_core; -} - -void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { - p->sys.core[0].pipeline_duty_cycle = - duty_cycle * p->sys.scaling_coefficients[PIPE_A]; - sample_perf_counters[PIPE_A] = duty_cycle; -} - -void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, - double dram_precharge) { - p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + - writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; - p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; - sample_perf_counters[MEM_RD] = reads; - sample_perf_counters[MEM_WR] = writes; - sample_perf_counters[MEM_PRE] = dram_precharge; -} - -void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, - double ialu_accesses, - double sfu_accesses) { - p->sys.core[0].fpu_accesses = - fpu_accesses * p->sys.scaling_coefficients[FPU_ACC]; - // Integer ALU (not present in Tesla) - p->sys.core[0].ialu_accesses = - ialu_accesses * p->sys.scaling_coefficients[SP_ACC]; - // Sfu accesses - p->sys.core[0].mul_accesses = - sfu_accesses * p->sys.scaling_coefficients[SFU_ACC]; - - sample_perf_counters[SP_ACC] = ialu_accesses; - sample_perf_counters[SFU_ACC] = sfu_accesses; - sample_perf_counters[FPU_ACC] = fpu_accesses; -} - -void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, - double sfu_avg_active_lane) { - p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; - p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; -} - -void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_reads, - double noc_tot_writes) { - p->sys.NoC[0].total_accesses = - noc_tot_reads * p->sys.scaling_coefficients[NOC_A] + - noc_tot_writes * p->sys.scaling_coefficients[NOC_A]; - sample_perf_counters[NOC_A] = noc_tot_reads + noc_tot_writes; -} - -void gpgpu_sim_wrapper::power_metrics_calculations() { - total_sample_count++; - kernel_sample_count++; - - // Current sample power - double sample_power = - proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONST_DYNAMICP]; - - // Average power - // Previous + new + constant dynamic power (e.g., dynamic clocking power) - kernel_tot_power += sample_power; - kernel_power.avg = kernel_tot_power / kernel_sample_count; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].avg += (double)sample_cmp_pwr[ind]; - } - - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].avg += (double)sample_perf_counters[ind]; - } - - // Max Power - if (sample_power > kernel_power.max) { - kernel_power.max = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; - } - } - - // Min Power - if (sample_power < kernel_power.min || (kernel_power.min == 0)) { - kernel_power.min = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - 
kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; - } - } - - gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); - gpu_tot_power.max = - (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; - gpu_tot_power.min = - ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) - ? sample_power - : gpu_tot_power.min; -} - -void gpgpu_sim_wrapper::print_trace_files() { - open_files(); - - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzprintf(power_trace_file, "%f,", proc_power); - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); - } - gzprintf(power_trace_file, "\n"); - - close_files(); -} - -void gpgpu_sim_wrapper::update_coefficients() { - initpower_coeff[FP_INT] = proc->cores[0]->get_coefficient_fpint_insts(); - effpower_coeff[FP_INT] = - initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; - - initpower_coeff[TOT_INST] = proc->cores[0]->get_coefficient_tot_insts(); - effpower_coeff[TOT_INST] = - initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; - - initpower_coeff[REG_RD] = - proc->cores[0]->get_coefficient_regreads_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[REG_WR] = - proc->cores[0]->get_coefficient_regwrites_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[NON_REG_OPs] = - proc->cores[0]->get_coefficient_noregfileops_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - effpower_coeff[REG_RD] = - initpower_coeff[REG_RD] * p->sys.scaling_coefficients[REG_RD]; - effpower_coeff[REG_WR] = - initpower_coeff[REG_WR] * p->sys.scaling_coefficients[REG_WR]; - effpower_coeff[NON_REG_OPs] = - initpower_coeff[NON_REG_OPs] * p->sys.scaling_coefficients[NON_REG_OPs]; - - initpower_coeff[IC_H] = proc->cores[0]->get_coefficient_icache_hits(); - initpower_coeff[IC_M] = proc->cores[0]->get_coefficient_icache_misses(); - effpower_coeff[IC_H] = - initpower_coeff[IC_H] * p->sys.scaling_coefficients[IC_H]; - effpower_coeff[IC_M] = - initpower_coeff[IC_M] * p->sys.scaling_coefficients[IC_M]; - - initpower_coeff[CC_H] = (proc->cores[0]->get_coefficient_ccache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[CC_M] = (proc->cores[0]->get_coefficient_ccache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[CC_H] = - initpower_coeff[CC_H] * p->sys.scaling_coefficients[CC_H]; - effpower_coeff[CC_M] = - initpower_coeff[CC_M] * p->sys.scaling_coefficients[CC_M]; - - initpower_coeff[TC_H] = (proc->cores[0]->get_coefficient_tcache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[TC_M] = (proc->cores[0]->get_coefficient_tcache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[TC_H] = - initpower_coeff[TC_H] * p->sys.scaling_coefficients[TC_H]; - effpower_coeff[TC_M] = - initpower_coeff[TC_M] * p->sys.scaling_coefficients[TC_M]; - - initpower_coeff[SHRD_ACC] = - proc->cores[0]->get_coefficient_sharedmemory_readhits(); - effpower_coeff[SHRD_ACC] = - initpower_coeff[SHRD_ACC] * p->sys.scaling_coefficients[SHRD_ACC]; - - initpower_coeff[DC_RH] = (proc->cores[0]->get_coefficient_dcache_readhits() + - 
proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_RM] = - (proc->cores[0]->get_coefficient_dcache_readmisses() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_WH] = (proc->cores[0]->get_coefficient_dcache_writehits() + - proc->get_coefficient_writecoalescing()); - initpower_coeff[DC_WM] = - (proc->cores[0]->get_coefficient_dcache_writemisses() + - proc->get_coefficient_writecoalescing()); - effpower_coeff[DC_RH] = - initpower_coeff[DC_RH] * p->sys.scaling_coefficients[DC_RH]; - effpower_coeff[DC_RM] = - initpower_coeff[DC_RM] * p->sys.scaling_coefficients[DC_RM]; - effpower_coeff[DC_WH] = - initpower_coeff[DC_WH] * p->sys.scaling_coefficients[DC_WH]; - effpower_coeff[DC_WM] = - initpower_coeff[DC_WM] * p->sys.scaling_coefficients[DC_WM]; - - initpower_coeff[L2_RH] = proc->get_coefficient_l2_read_hits(); - initpower_coeff[L2_RM] = proc->get_coefficient_l2_read_misses(); - initpower_coeff[L2_WH] = proc->get_coefficient_l2_write_hits(); - initpower_coeff[L2_WM] = proc->get_coefficient_l2_write_misses(); - effpower_coeff[L2_RH] = - initpower_coeff[L2_RH] * p->sys.scaling_coefficients[L2_RH]; - effpower_coeff[L2_RM] = - initpower_coeff[L2_RM] * p->sys.scaling_coefficients[L2_RM]; - effpower_coeff[L2_WH] = - initpower_coeff[L2_WH] * p->sys.scaling_coefficients[L2_WH]; - effpower_coeff[L2_WM] = - initpower_coeff[L2_WM] * p->sys.scaling_coefficients[L2_WM]; - - initpower_coeff[IDLE_CORE_N] = - p->sys.idle_core_power * proc->cores[0]->executionTime; - effpower_coeff[IDLE_CORE_N] = - initpower_coeff[IDLE_CORE_N] * p->sys.scaling_coefficients[IDLE_CORE_N]; - - initpower_coeff[PIPE_A] = proc->cores[0]->get_coefficient_duty_cycle(); - effpower_coeff[PIPE_A] = - initpower_coeff[PIPE_A] * p->sys.scaling_coefficients[PIPE_A]; - - initpower_coeff[MEM_RD] = proc->get_coefficient_mem_reads(); - initpower_coeff[MEM_WR] = proc->get_coefficient_mem_writes(); - initpower_coeff[MEM_PRE] = proc->get_coefficient_mem_pre(); - effpower_coeff[MEM_RD] = - initpower_coeff[MEM_RD] * p->sys.scaling_coefficients[MEM_RD]; - effpower_coeff[MEM_WR] = - initpower_coeff[MEM_WR] * p->sys.scaling_coefficients[MEM_WR]; - effpower_coeff[MEM_PRE] = - initpower_coeff[MEM_PRE] * p->sys.scaling_coefficients[MEM_PRE]; - - initpower_coeff[SP_ACC] = - proc->cores[0]->get_coefficient_ialu_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - ; - initpower_coeff[SFU_ACC] = proc->cores[0]->get_coefficient_sfu_accesses(); - initpower_coeff[FPU_ACC] = proc->cores[0]->get_coefficient_fpu_accesses(); - - effpower_coeff[SP_ACC] = - initpower_coeff[SP_ACC] * p->sys.scaling_coefficients[SP_ACC]; - effpower_coeff[SFU_ACC] = - initpower_coeff[SFU_ACC] * p->sys.scaling_coefficients[SFU_ACC]; - effpower_coeff[FPU_ACC] = - initpower_coeff[FPU_ACC] * p->sys.scaling_coefficients[FPU_ACC]; - - initpower_coeff[NOC_A] = proc->get_coefficient_noc_accesses(); - effpower_coeff[NOC_A] = - initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A]; - - const_dynamic_power = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - - for (unsigned i = 0; i < num_perf_counters; i++) { - initpower_coeff[i] /= (proc->cores[0]->executionTime); - effpower_coeff[i] /= (proc->cores[0]->executionTime); - } -} - -void gpgpu_sim_wrapper::update_components_power() { - update_coefficients(); - - proc_power = proc->rt_power.readOp.dynamic; - - sample_cmp_pwr[IBP] = - (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + - proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + - 
proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[ICP] = proc->cores[0]->ifu->icache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[DCP] = proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[TCP] = proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[CCP] = proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[SHRDP] = - proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[RFP] = - (proc->cores[0]->exu->rfu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SPP] = - (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SFUP] = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[FPUP] = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[L2CP] = (proc->XML->sys.number_of_L2s > 0) - ? proc->l2array[0]->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime) - : 0; - - sample_cmp_pwr[MCP] = (proc->mc->rt_power.readOp.dynamic - - proc->mc->dram->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[NOCP] = - proc->nocs[0]->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[DRAMP] = - proc->mc->dram->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[PIPEP] = - proc->cores[0]->Pipeline_energy / (proc->cores[0]->executionTime); - - sample_cmp_pwr[IDLE_COREP] = - proc->cores[0]->IdleCoreEnergy / (proc->cores[0]->executionTime); - - // This constant dynamic power (e.g., clock power) part is estimated via - // regression model. - sample_cmp_pwr[CONST_DYNAMICP] = 0; - double cnst_dyn = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - // If the regression scaling term is greater than the recorded constant - // dynamic power then use the difference (other portion already added to - // dynamic power). Else, all the constant dynamic power is accounted for, add - // nothing. 
- if (p->sys.scaling_coefficients[CONST_DYNAMICN] > cnst_dyn) - sample_cmp_pwr[CONST_DYNAMICP] = - (p->sys.scaling_coefficients[CONST_DYNAMICN] - cnst_dyn); - - proc_power += sample_cmp_pwr[CONST_DYNAMICP]; - - double sum_pwr_cmp = 0; - for (unsigned i = 0; i < num_pwr_cmps; i++) { - sum_pwr_cmp += sample_cmp_pwr[i]; - } - bool check = false; - check = sanity_check(sum_pwr_cmp, proc_power); - assert("Total Power does not equal the sum of the components\n" && (check)); -} - -void gpgpu_sim_wrapper::compute() { proc->compute(); } -void gpgpu_sim_wrapper::print_power_kernel_stats( - double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, - const std::string& kernel_info_string, bool print_trace) { - detect_print_steady_state(1, init_value); - if (g_power_simulation_enabled) { - powerfile << kernel_info_string << std::endl; - - sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); - powerfile << "Kernel Average Power Data:" << std::endl; - powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; - - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_avg_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].avg / kernel_sample_count - << std::endl; - } - - powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; - powerfile << "kernel_max_power = " << kernel_power.max << std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_max_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].max << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_max_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].max << std::endl; - } - - powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; - powerfile << "kernel_min_power = " << kernel_power.min << std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].min << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_min_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].min << std::endl; - } - - powerfile << std::endl - << "Accumulative Power Statistics Over Previous Kernels:" - << std::endl; - powerfile << "gpu_tot_avg_power = " - << gpu_tot_power.avg / total_sample_count << std::endl; - powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; - powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; - powerfile << std::endl << std::endl; - powerfile.flush(); - - if (print_trace) { - print_trace_files(); - } - } -} -void gpgpu_sim_wrapper::dump() { - if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); -} - -void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { - double temp_avg = sample_val / (double)samples.size(); - double temp_ipc = (init_val - init_inst_val) / - (double)(samples.size() * gpu_stat_sample_freq); - - if ((samples.size() > - gpu_steady_min_period)) { // If steady state occurred for some time, - // print to file - has_written_avg = true; - gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, - total_sample_count, temp_avg, temp_ipc); - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(steady_state_tacking_file, "%f,", - samples_counter.at(i) / ((double)samples.size())); - } - 
gzprintf(steady_state_tacking_file, "\n"); - } else { - if (!has_written_avg && position) - gzprintf(steady_state_tacking_file, - "ERROR! Not enough steady state points to generate average\n"); - } - - sample_start = 0; - sample_val = 0; - init_inst_val = init_val; - samples.clear(); - samples_counter.clear(); - pwr_counter.clear(); - assert(samples.size() == 0); -} - -void gpgpu_sim_wrapper::detect_print_steady_state(int position, - double init_val) { - // Calculating Average - if (g_power_simulation_enabled && g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); - if (position == 0) { - if (samples.size() == 0) { - // First sample - sample_start = total_sample_count; - sample_val = proc->rt_power.readOp.dynamic; - init_inst_val = init_val; - samples.push_back(proc->rt_power.readOp.dynamic); - assert(samples_counter.size() == 0); - assert(pwr_counter.size() == 0); - - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.push_back(sample_perf_counters[i]); - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.push_back(sample_cmp_pwr[i]); - } - assert(pwr_counter.size() == (double)num_pwr_cmps); - assert(samples_counter.size() == (double)num_perf_counters); - } else { - // Get current average - double temp_avg = sample_val / (double)samples.size(); - - if (abs(proc->rt_power.readOp.dynamic - temp_avg) < - gpu_steady_power_deviation) { // Value is within threshold - sample_val += proc->rt_power.readOp.dynamic; - samples.push_back(proc->rt_power.readOp.dynamic); - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.at(i) += sample_perf_counters[i]; - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.at(i) += sample_cmp_pwr[i]; - } - - } else { // Value exceeds threshold, not considered steady state - print_steady_state(position, init_val); - } - } - } else { - print_steady_state(position, init_val); - } - gzclose(steady_state_tacking_file); - } -} - -void gpgpu_sim_wrapper::open_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - power_trace_file = gzopen(g_power_trace_filename, "a"); - metric_trace_file = gzopen(g_metric_trace_filename, "a"); - } - } -} -void gpgpu_sim_wrapper::close_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - } -} diff --git a/version b/version index c832e567c..09e18b115 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.1.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.2.0 "; From f9b39ee0e9f63a19ecc3b137c51b12625c28a037 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 10 Aug 2024 00:09:27 +0800 Subject: [PATCH 093/133] mee sub partition v0.1 --- .../SM7_QV100/base/accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../base/accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../SM7_QV100/base/accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../base/accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../SM7_QV100/base/accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../base/accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../SM7_QV100/base/config_volta_islip.icnt | 74 +++ .../SM7_QV100/base/gpgpusim.config_base | 244 +++++++ src/abstract_hardware_model.h | 2 +- src/gpgpu-sim/gpu-cache.h | 8 + src/gpgpu-sim/gpu-sim.cc | 8 +- src/gpgpu-sim/gpu-sim.h | 2 + src/gpgpu-sim/l2cache.cc | 141 +++- 
src/gpgpu-sim/l2cache.h | 73 +-
 src/gpgpu-sim/mee.cc | 264 ++++++++
 src/gpgpu-sim/mee.h | 67 ++
 16 files changed, 4531 insertions(+), 50 deletions(-)
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt
 create mode 100644 configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base
 create mode 100644 src/gpgpu-sim/mee.cc
 create mode 100644 src/gpgpu-sim/mee.h

diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML configuration; the XML markup did not survive extraction and is omitted here]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML configuration; omitted as above]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; omitted as above]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; omitted as above]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; omitted as above]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; omitted as above]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base new file mode 100644 index 000000000..a4535ba80 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base @@ -0,0 +1,244 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
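The config_volta_islip.icnt file above splits memory traffic across two subnets: requests travel on subnet 0 and replies on subnet 1, which is the usual way to keep request/reply protocol deadlock out of the NoC. A trivial sketch of the mapping implied by the *_subnet options follows; the enum and function names are invented for illustration, and the real lookup happens inside the interconnect interface.

// Illustration only: subnet selection implied by config_volta_islip.icnt.
enum class Traffic { kReadRequest, kWriteRequest, kReadReply, kWriteReply };

int subnet_for(Traffic t) {
  switch (t) {
    case Traffic::kReadRequest:   // read_request_subnet  = 0
    case Traffic::kWriteRequest:  // write_request_subnet = 0
      return 0;
    case Traffic::kReadReply:     // read_reply_subnet    = 1
    case Traffic::kWriteReply:    // write_reply_subnet   = 1
      return 1;
  }
  return 0;  // unreachable
}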
+ + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 5000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# volta has 8 banks, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of Register File Cache (RFC) which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Greedy then oldest scheduler
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shd mem = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode in case of multi kernels/apps execution
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 0
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note, TEX is deprecated in Volta; it is used for legacy apps only.
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index f04741f75..dee38e86f 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -767,7 +767,7 @@ typedef std::bitset mem_access_sector_mask_t; MA_TUP(GLOBAL_ACC_R), MA_TUP(LOCAL_ACC_R), MA_TUP(CONST_ACC_R), \ MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W), \ MA_TUP(L1_WRBK_ACC), MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R), \ - MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), \ + MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), MA_TUP(META_ACC), \ MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type) #define MA_TUP_BEGIN(X) enum X { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 498dfebd0..2688f5676 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1002,6 +1002,10 @@ class tag_array { typedef tr1_hash_map line_table; line_table pending_lines; + + friend class basline_cache; + friend class l2_cache; + friend class sub_mee; }; class mshr_table { @@ -1436,6 +1440,10 @@ class baseline_cache : public cache_t { }; bandwidth_management m_bandwidth_management; + + friend class l2_cache; + friend class data_cache; + friend class sub_mee; }; /// Read only cache diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index e44551ee3..8369f6e61 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ 
b/src/gpgpu-sim/gpu-sim.cc @@ -237,6 +237,12 @@ void memory_config::reg_options(class OptionParser *opp) { " {::,:::,::,}", "64:128:8,L:B:m:N,A:16:4,4"); + option_parser_register(opp, "-gpgpu_cache:dmeta", OPT_CSTR, + &m_META_config.m_config_string, + "unified banked META data cache config " + " {::,:::,::,}", + "64:128:8,L:B:m:N,A:16:4,4"); option_parser_register(opp, "-gpgpu_cache:dl2_texture_only", OPT_BOOL, &m_L2_texure_only, "L2 cache used for texture only", "1"); @@ -2024,7 +2030,7 @@ void gpgpu_sim::cycle() { if (m_memory_config->m_L2_config.get_num_lines()) { int dlc = 0; for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - dlc = m_memory_sub_partition[i]->flushL2(); + dlc = m_memory_sub_partition[i]->flushL2();//TODO assert(dlc == 0); // TODO: need to model actual writes to DRAM here printf("Dirty lines flushed from L2 %d is %d\n", i, dlc); } diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 68b3dfa10..731f5e8e6 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -265,6 +265,7 @@ class memory_config { m_address_mapping.init(m_n_mem, m_n_sub_partition_per_memory_channel); m_L2_config.init(&m_address_mapping); + m_META_config.init(&m_address_mapping); m_valid = true; @@ -276,6 +277,7 @@ class memory_config { bool m_valid; mutable l2_cache_config m_L2_config; + mutable l2_cache_config m_META_config; bool m_L2_texure_only; char *gpgpu_dram_timing_opt; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 511c15efa..06f5a662f 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -38,10 +38,10 @@ #include "../option_parser.h" #include "../statwrapper.h" #include "dram.h" -#include "gpu-cache.h" #include "gpu-sim.h" #include "histogram.h" #include "l2cache.h" +#include "mee.h" #include "l2cache_trace.h" #include "mem_fetch.h" #include "mem_latency_stat.h" @@ -214,7 +214,7 @@ void memory_partition_unit::visualizer_print(gzFile visualizer_file) const { // determine whether a given subpartition can issue to DRAM bool memory_partition_unit::can_issue_to_dram(int inner_sub_partition_id) { int spid = inner_sub_partition_id; - bool sub_partition_contention = m_sub_partition[spid]->dram_L2_queue_full(); + bool sub_partition_contention = m_sub_partition[spid]->dram_mee_queue_full(); bool has_dram_resource = m_arbitration_metadata.has_credits(spid); MEMPART_DPRINTF( @@ -245,12 +245,12 @@ void memory_partition_unit::simple_dram_model_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_L2_queue_full()) { + if (!m_sub_partition[dest_spid]->dram_mee_queue_full()) { if (mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_L2_queue_push(mf_return); + m_sub_partition[dest_spid]->dram_mee_queue_push(mf_return); mf_return->set_status( IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); @@ -278,12 +278,12 @@ void memory_partition_unit::simple_dram_model_cycle() { p++) { int spid = (p + last_issued_partition + 1) % m_config->m_n_sub_partition_per_memory_channel; - if (!m_sub_partition[spid]->L2_dram_queue_empty() && + if (!m_sub_partition[spid]->mee_dram_queue_empty() && can_issue_to_dram(spid)) { - mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); + mem_fetch *mf = 
m_sub_partition[spid]->mee_dram_queue_top(); if (m_dram->full(mf->is_write())) break; - m_sub_partition[spid]->L2_dram_queue_pop(); + m_sub_partition[spid]->mee_dram_queue_pop(); MEMPART_DPRINTF( "Issue mem_fetch request %p from sub partition %d to dram\n", mf, spid); @@ -309,12 +309,12 @@ void memory_partition_unit::dram_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_L2_queue_full()) { + if (!m_sub_partition[dest_spid]->dram_mee_queue_full()) { if (mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_L2_queue_push(mf_return); + m_sub_partition[dest_spid]->dram_mee_queue_push(mf_return); mf_return->set_status(IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_arbitration_metadata.return_credit(dest_spid); @@ -340,12 +340,12 @@ void memory_partition_unit::dram_cycle() { p++) { int spid = (p + last_issued_partition + 1) % m_config->m_n_sub_partition_per_memory_channel; - if (!m_sub_partition[spid]->L2_dram_queue_empty() && + if (!m_sub_partition[spid]->mee_dram_queue_empty() && can_issue_to_dram(spid)) { - mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); + mem_fetch *mf = m_sub_partition[spid]->mee_dram_queue_top(); if (m_dram->full(mf->is_write())) break; - m_sub_partition[spid]->L2_dram_queue_pop(); + m_sub_partition[spid]->mee_dram_queue_pop(); MEMPART_DPRINTF( "Issue mem_fetch request %p from sub partition %d to dram\n", mf, spid); @@ -370,6 +370,11 @@ void memory_partition_unit::dram_cycle() { mem_fetch *mf = m_dram_latency_queue.front().req; m_dram_latency_queue.pop_front(); m_dram->push(mf); + // if (mf->get_sub_partition_id() == 0) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf_return->get_addr(), mf_return->get_sub_partition_id(), mf_return->get_partition_addr(), mf_return->get_access_type()); + + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); + } } @@ -430,12 +435,25 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, char L2c_name[32]; snprintf(L2c_name, 32, "L2_bank_%03d", m_id); m_L2interface = new L2interface(this); + m_metainterface = new metainterface(this); m_mf_allocator = new partition_mf_allocator(config); - if (!m_config->m_L2_config.disabled()) + if (!m_config->m_L2_config.disabled()) { m_L2cache = new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, m_L2interface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_CTRcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_MACcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_BMTcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + } + + m_sub_mee = new sub_mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); unsigned int icnt_L2; unsigned int L2_dram; @@ -444,19 +462,24 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, &dram_L2, 
&L2_icnt); m_icnt_L2_queue = new fifo_pipeline("icnt-to-L2", 0, icnt_L2); - m_L2_dram_queue = new fifo_pipeline("L2-to-dram", 0, L2_dram); - m_dram_L2_queue = new fifo_pipeline("dram-to-L2", 0, dram_L2); + m_L2_mee_queue = new fifo_pipeline("L2-to-mee", 0, L2_dram); + m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); + m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); + m_mee_L2_queue = new fifo_pipeline("mee-to-L2", 0, dram_L2); m_L2_icnt_queue = new fifo_pipeline("L2-to-icnt", 0, L2_icnt); wb_addr = -1; } memory_sub_partition::~memory_sub_partition() { delete m_icnt_L2_queue; - delete m_L2_dram_queue; - delete m_dram_L2_queue; + delete m_L2_mee_queue; + delete m_mee_L2_queue; delete m_L2_icnt_queue; delete m_L2cache; + delete m_CTRcache; delete m_L2interface; + delete m_metainterface; + delete m_sub_mee; } void memory_sub_partition::cache_cycle(unsigned cycle) { @@ -488,30 +511,32 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { } // DRAM to L2 (texture) and icnt (not texture) - if (!m_dram_L2_queue->empty()) { - mem_fetch *mf = m_dram_L2_queue->top(); + if (!m_mee_L2_queue->empty()) { + mem_fetch *mf = m_mee_L2_queue->top(); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { if (m_L2cache->fill_port_free()) { mf->set_status(IN_PARTITION_L2_FILL_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_L2cache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - m_dram_L2_queue->pop(); + m_mee_L2_queue->pop(); } } else if (!m_L2_icnt_queue->full()) { if (mf->is_write() && mf->get_type() == WRITE_ACK) mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_L2_icnt_queue->push(mf); - m_dram_L2_queue->pop(); + m_mee_L2_queue->pop(); } } - // prior L2 misses inserted into m_L2_dram_queue here + m_sub_mee->simple_cycle(cycle); + + // prior L2 misses inserted into m_L2_mee_queue here if (!m_config->m_L2_config.disabled()) m_L2cache->cycle(); // new L2 texture accesses and/or non-texture accesses - if (!m_L2_dram_queue->full() && !m_icnt_L2_queue->empty()) { + if (!m_L2_mee_queue->full() && !m_icnt_L2_queue->empty()) { mem_fetch *mf = m_icnt_L2_queue->top(); if (!m_config->m_L2_config.disabled() && ((m_config->m_L2_texure_only && mf->istexture()) || @@ -577,7 +602,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // L2 is disabled or non-texture access to texture-only L2 mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_dram_queue->push(mf); + m_L2_mee_queue->push(mf); m_icnt_L2_queue->pop(); } } @@ -599,22 +624,66 @@ bool memory_sub_partition::full(unsigned size) const { return m_icnt_L2_queue->is_avilable_size(size); } -bool memory_sub_partition::L2_dram_queue_empty() const { - return m_L2_dram_queue->empty(); +// interface to L2_mee_queue + +bool memory_sub_partition::L2_mee_queue_empty() const { + return m_L2_mee_queue->empty(); // TODO +} + +class mem_fetch *memory_sub_partition::L2_mee_queue_top() const { + return m_L2_mee_queue->top(); // TODO +} + +void memory_sub_partition::L2_mee_queue_pop() { m_L2_mee_queue->pop(); } // TODO + +// interface to mee_dram_queue + +bool memory_sub_partition::mee_dram_queue_empty() const { + return m_mee_dram_queue->empty(); // TODO +} + +class mem_fetch *memory_sub_partition::mee_dram_queue_top() const { + return m_mee_dram_queue->top(); // TODO +} + +void memory_sub_partition::mee_dram_queue_pop() { m_mee_dram_queue->pop(); } // TODO + +bool 
memory_sub_partition::mee_dram_queue_full() const { + return m_mee_dram_queue->full(); //TODO +} + +void memory_sub_partition::mee_dram_queue_push(class mem_fetch *mf) { + m_mee_dram_queue->push(mf); //TODO +} + +// interface to dram_mee_queue + +bool memory_sub_partition::dram_mee_queue_empty() const { + return m_dram_mee_queue->empty(); // TODO +} + +class mem_fetch *memory_sub_partition::dram_mee_queue_top() const { + return m_dram_mee_queue->top(); // TODO +} + +void memory_sub_partition::dram_mee_queue_pop() { m_dram_mee_queue->pop(); } // TODO + +bool memory_sub_partition::dram_mee_queue_full() const { + return m_dram_mee_queue->full(); //TODO } -class mem_fetch *memory_sub_partition::L2_dram_queue_top() const { - return m_L2_dram_queue->top(); +void memory_sub_partition::dram_mee_queue_push(class mem_fetch *mf) { + m_dram_mee_queue->push(mf); //TODO } -void memory_sub_partition::L2_dram_queue_pop() { m_L2_dram_queue->pop(); } +// interface to mee_L2_queue -bool memory_sub_partition::dram_L2_queue_full() const { - return m_dram_L2_queue->full(); +bool memory_sub_partition::mee_L2_queue_full() const { + return m_mee_L2_queue->full(); //TODO } -void memory_sub_partition::dram_L2_queue_push(class mem_fetch *mf) { - m_dram_L2_queue->push(mf); +void memory_sub_partition::mee_L2_queue_push(class mem_fetch *mf) { + m_mee_L2_queue->push(mf); //TODO } void memory_sub_partition::print_cache_stat(unsigned &accesses, @@ -697,14 +766,16 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned memory_sub_partition::flushL2() { if (!m_config->m_L2_config.disabled()) { - m_L2cache->flush(); + m_L2cache->flush();//TODO + m_CTRcache->flush(); } return 0; // TODO: write the flushed data to the main memory } unsigned memory_sub_partition::invalidateL2() { if (!m_config->m_L2_config.disabled()) { - m_L2cache->invalidate(); + m_L2cache->invalidate();//TODO + m_CTRcache->invalidate(); } return 0; } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 902a4b7c0..4d640c995 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -32,6 +32,7 @@ #include "../abstract_hardware_model.h" #include "dram.h" +#include "gpu-cache.h" #include #include @@ -154,6 +155,8 @@ class memory_partition_unit { std::list m_dram_latency_queue; class gpgpu_sim *m_gpu; + + friend class sub_mee; }; class memory_sub_partition { @@ -178,14 +181,30 @@ class memory_sub_partition { unsigned flushL2(); unsigned invalidateL2(); - // interface to L2_dram_queue - bool L2_dram_queue_empty() const; - class mem_fetch *L2_dram_queue_top() const; - void L2_dram_queue_pop(); + // interface to L2_mee_queue + bool L2_mee_queue_empty() const; + class mem_fetch *L2_mee_queue_top() const; + void L2_mee_queue_pop(); + + // interface to mee_dram_queue + bool mee_dram_queue_full() const; + void mee_dram_queue_push(class mem_fetch *mf); + + bool mee_dram_queue_empty() const; + class mem_fetch *mee_dram_queue_top() const; + void mee_dram_queue_pop(); + + // interface to dram_mee_queue + bool dram_mee_queue_full() const; + void dram_mee_queue_push(class mem_fetch *mf); + + bool dram_mee_queue_empty() const; + class mem_fetch *dram_mee_queue_top() const; + void dram_mee_queue_pop(); - // interface to dram_L2_queue - bool dram_L2_queue_full() const; - void dram_L2_queue_push(class mem_fetch *mf); + // interface to mee_L2_queue + bool mee_L2_queue_full() const; + void mee_L2_queue_push(class mem_fetch *mf); void visualizer_print(gzFile visualizer_file); void print_cache_stat(unsigned &accesses, unsigned &misses) const; @@ 
-203,13 +222,20 @@ class memory_sub_partition { m_L2cache->force_tag_access(addr, m_memcpy_cycle_offset + time, mask); m_memcpy_cycle_offset += 1; } - + // class l2_cache *m_CTRcache; + std::vector breakdown_request_to_sector_requests(mem_fetch *mf); + private: // data unsigned m_id; //< the global sub partition ID const memory_config *m_config; class l2_cache *m_L2cache; class L2interface *m_L2interface; + class l2_cache *m_CTRcache; + class l2_cache *m_MACcache; + class l2_cache *m_BMTcache; + class sub_mee *m_sub_mee; + class metainterface *m_metainterface; class gpgpu_sim *m_gpu; partition_mf_allocator *m_mf_allocator; @@ -222,8 +248,10 @@ class memory_sub_partition { // these are various FIFOs between units within a memory partition fifo_pipeline *m_icnt_L2_queue; - fifo_pipeline *m_L2_dram_queue; - fifo_pipeline *m_dram_L2_queue; + fifo_pipeline *m_L2_mee_queue; + fifo_pipeline *m_mee_dram_queue; + fifo_pipeline *m_dram_mee_queue; + fifo_pipeline *m_mee_L2_queue; fifo_pipeline *m_L2_icnt_queue; // L2 cache hit response queue class mem_fetch *L2dramout; @@ -234,8 +262,9 @@ class memory_sub_partition { std::set m_request_tracker; friend class L2interface; + friend class metainterface; - std::vector breakdown_request_to_sector_requests(mem_fetch *mf); + // std::vector breakdown_request_to_sector_requests(mem_fetch *mf); // This is a cycle offset that has to be applied to the l2 accesses to account // for the cudamemcpy read/writes. We want GPGPU-Sim to only count cycles for @@ -252,11 +281,29 @@ class L2interface : public mem_fetch_interface { virtual ~L2interface() {} virtual bool full(unsigned size, bool write) const { // assume read and write packets all same size - return m_unit->m_L2_dram_queue->full(); + return m_unit->m_L2_mee_queue->full(); + } + virtual void push(mem_fetch *mf) { + mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); + m_unit->m_L2_mee_queue->push(mf); + // printf("l2 to mee access type: %d\n",mf->get_access_type()); + } + + private: + memory_sub_partition *m_unit; +}; + +class metainterface : public mem_fetch_interface { + public: + metainterface(memory_sub_partition *unit) { m_unit = unit; } + virtual ~metainterface() {} + virtual bool full(unsigned size, bool write) const { + // assume read and write packets all same size + return m_unit->m_mee_dram_queue->full(); } virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); - m_unit->m_L2_dram_queue->push(mf); + m_unit->m_mee_dram_queue->push(mf); } private: diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc new file mode 100644 index 000000000..56182216e --- /dev/null +++ b/src/gpgpu-sim/mee.cc @@ -0,0 +1,264 @@ +#include "mee.h" +#include + +sub_mee::sub_mee(class memory_sub_partition *sub_partition, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : + m_sub_partition(sub_partition), + m_CTRcache(CTRcache), + m_MACcache(MACcache), + m_BMTcache(BMTcache), + m_config(config), + m_gpu(gpu) { + +} +int decode(int addr) { + return (addr & 16128) >> 8; +} +void sub_mee::print_addr(char s[], mem_fetch *mf) { + if (mf->get_sub_partition_id() == 0) { + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); + // print_tag(); + } +} + +void sub_mee::print_tag() { + // if (get_sub_partition_id(mf) == 0) { + // 
for (unsigned i = 0; i < m_config->m_META_config.get_num_lines(); i++) { + for (unsigned i = 188; i < 192; i++) { + printf("line %d:\t", i); + for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + // printf("%d\t", + m_CTRcache->m_tag_array->m_lines[i]->print_status(); + printf("\n"); + } + // } +} + +new_addr_type sub_mee::get_partition_addr(mem_fetch *mf) { + new_addr_type partition_addr = mf->get_addr() >> (8 + 6) << 8; + partition_addr |= mf->get_addr() & ((1 << 8) - 1); + // return partition_addr; + // printf("%x %x\n", mf->get_addr(), mf->get_partition_addr()); + return mf->get_partition_addr(); +} + +new_addr_type sub_mee::get_sub_partition_id(mem_fetch *mf) { + // return (mf->get_addr() >> 8) & ((1 << 6) - 1); + + return mf->get_sub_partition_id(); +} + +bool sub_mee::META_queue_empty() { + return m_CTR_queue.empty() && m_Ciphertext_queue.empty() && m_MAC_queue.empty(); +} + +new_addr_type sub_mee::get_addr(new_addr_type sub_partition_id, new_addr_type partition_addr) { + new_addr_type new_addr = partition_addr >> 8 << (8 + 6); + new_addr |= partition_addr & ((1 << 8) - 1); + new_addr |= sub_partition_id << 8; + // printf("%x %x %x\n", new_addr, sub_partition_id, partition_addr); + return new_addr; +} + +void sub_mee::gen_CTR_mf(mem_fetch *mf, bool wr) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + partition_addr = partition_addr >> 14 << 7; + new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); + CTR_addr |= CTR_mask; + + meta_access(m_CTR_queue, CTR_addr, META_ACC, + 128, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); +} + +void sub_mee::gen_MAC_mf(mem_fetch *mf, bool wr) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + partition_addr = partition_addr >> 7 << 3; + new_addr_type MAC_addr = get_addr(sub_partition_id, partition_addr); + MAC_addr |= MAC_mask; + + meta_access(m_MAC_queue, MAC_addr, META_ACC, + 64, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); +} + +void sub_mee::meta_access( + std::list &m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + + mem_access_byte_mask_t byte_mask; + mem_access_sector_mask_t sector_mask; + for (unsigned i = 0; i < size; i++) byte_mask.set(i); + for (unsigned i = 0; i < size/32; i++) sector_mask.set(i + (addr & (1 << 7) ? 2 : 0)); + + mem_access_t acc(type, addr, size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + acc, NULL /*we don't have an instruction yet*/, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, + wid, sid, tpc, m_config, cycle, original_mf); + // mf->set_chip(original_mf->get_sub_partition_id) + + std::vector reqs; + // if (m_config->m_L2_config.m_cache_type == SECTOR) + reqs = m_sub_partition->breakdown_request_to_sector_requests(mf); + // else + // reqs.push_back(mf); + + for (unsigned i = 0; i < reqs.size(); ++i) { + mem_fetch *req = reqs[i]; + m_META_queue.push_back(req); + } +} + +void sub_mee::CTR_cycle() { + m_CTRcache->cycle(); + if (!m_CTR_queue.empty() && !m_sub_partition->mee_dram_queue_full()) { + mem_fetch *mf = m_CTR_queue.front(); + std::list events; + enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + // print_addr("CTR cycle access:\t\t", mf); + if (status == HIT) { + m_CTR_queue.pop_front(); + } else if (status != RESERVATION_FAIL) { + // set wating for CTR fill + // print_addr("CTR cycle access:\t\t", mf); + m_CTR_queue.pop_front(); + } else { + // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + if (get_sub_partition_id(mf) == 0) + enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + assert(!write_sent); + assert(!read_sent); + } + } +}; + +void sub_mee::MAC_cycle() { + m_MACcache->cycle(); + if (!m_MAC_queue.empty() && !m_sub_partition->mee_dram_queue_full()) { + mem_fetch *mf = m_MAC_queue.front(); + std::list events; + enum cache_request_status status = m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + if (status == HIT) { + m_MAC_queue.pop_front(); + } else if (status != RESERVATION_FAIL) { + // set wating for CTR fill + m_MAC_queue.pop_front(); + } else { + assert(!write_sent); + assert(!read_sent); + } + } +}; + +void sub_mee::BMT_cycle() { + +}; +void sub_mee::AES_cycle() { + +}; + +void sub_mee::META_fill_responses(class l2_cache *m_METAcache, const new_addr_type MASK) { + if (m_METAcache->access_ready()) { + mem_fetch *mf = m_METAcache->next_access(); + // print_addr("fill responses:", mf); + // reply(m_METAcache, mf); + delete mf; + } +} + +void sub_mee::META_fill(class l2_cache *m_METAcache, mem_fetch *mf, const new_addr_type MASK) { + if (!(mf->get_addr() & MASK) && m_METAcache->waiting_for_fill(mf)) { + // print_addr("wating for fill:\t\t", mf); + if (m_METAcache->fill_port_free()) { + m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + + m_memcpy_cycle_offset); + // print_addr("fill:\t\t\t\t", mf); + m_sub_partition->dram_mee_queue_pop(); + } + } +} + +void sub_mee::simple_cycle(unsigned cycle) { + // META Cache fill responses + META_fill_responses(m_CTRcache, CTR_mask); + META_fill_responses(m_MACcache, MAC_mask); + // META_fill_responses(m_BMTcache); + // dram to mee + if (!m_sub_partition->dram_mee_queue_empty()) { + mem_fetch *mf_return = m_sub_partition->dram_mee_queue_top(); + // print_addr("dram_mee_queue_top:\t", mf_return); + // mee to L2 + META_fill(m_CTRcache, mf_return, CTR_mask); + META_fill(m_MACcache, mf_return, MAC_mask); + // META_fill(m_BMTcache, mf_return); + // if (!m_sub_partition->mee_L2_queue_full()) { + + if (mf_return->get_access_type() == META_ACC) { // META访存的返回,需要响应 + // printf("Success handle CTR_ACC: "); + // print_addr(mf_return); + // delete mf_return; + } else { // 密文访存返回 + // 
reply L2 read + // reply L2 write back + m_sub_partition->mee_L2_queue_push(mf_return); + m_sub_partition->dram_mee_queue_pop(); + // print_addr(mf_return); + } + + // } + } + // L2 to mee + if (!m_sub_partition->L2_mee_queue_empty()) { + mem_fetch *mf = m_sub_partition->L2_mee_queue_top(); + // mee to dram + if (!m_sub_partition->mee_dram_queue_full() && META_queue_empty()) { + if (!mf->is_write()) { // L2 read + // CTR access + gen_CTR_mf(mf, false); + // Ciphertext access + m_Ciphertext_queue.push_back(mf); + // MAC access + gen_MAC_mf(mf, false); + // AES Decryption + AES_cycle(); + // Hash MAC + + // MAC Check + // BMT Check + } else { // L2 write back + // CTR access + gen_CTR_mf(mf, false); + // CTR update + gen_CTR_mf(mf, true); + // AES Ecryption + // Ciphertext Update + // MAC access + gen_MAC_mf(mf, false); + // MAC Hash + // MAC Update + gen_MAC_mf(mf, true); + // BMT Update + } + + m_sub_partition->L2_mee_queue_pop(); + + } else { + } + } + CTR_cycle(); + if (!m_Ciphertext_queue.empty() && !m_sub_partition->mee_dram_queue_full() && m_CTR_queue.empty()) { + mem_fetch *mf = m_Ciphertext_queue.front(); + m_sub_partition->mee_dram_queue_push(mf); + m_Ciphertext_queue.pop_front(); + } + MAC_cycle(); +} + +void sub_mee::cycle(unsigned cycle) { +} \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h new file mode 100644 index 000000000..4b12b5add --- /dev/null +++ b/src/gpgpu-sim/mee.h @@ -0,0 +1,67 @@ + +// class mem_fetch; +// class memory_sub_partition; +// class gpgpu_sim; +// class new_addr_type; +// class mem_access_type; +// class memory_config; +#include "mem_fetch.h" +#include "l2cache.h" +#include "shader.h" +#include "gpu-sim.h" + +class mee { + public: + + private: + +}; + +class sub_mee { + public: + sub_mee(class memory_sub_partition *sub_partition, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); + void cycle(unsigned cycle); + void simple_cycle(unsigned cycle); + void print_addr(char s[], mem_fetch *mf); + void print_tag(); + void meta_access(std::list &m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, mem_fetch *original_mf) const; + void CTR_cycle(); + void MAC_cycle(); + void BMT_cycle(); + void AES_cycle(); + new_addr_type get_partition_addr(mem_fetch *mf); + new_addr_type get_sub_partition_id(mem_fetch *mf); + new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); + + void gen_CTR_mf(mem_fetch *mf, bool wr); + void gen_MAC_mf(mem_fetch *mf, bool wr); + // void gen_BMT_mf(mem_fetch *mf, bool wr); + bool META_queue_empty(); + + void META_fill_responses(class l2_cache *m_METAcache, const new_addr_type MASK); + void META_fill(class l2_cache *m_METAcache, mem_fetch *mf, const new_addr_type MASK); + + bool CTR_busy(); + bool MAC_busy(); + bool BMT_busy(); + + + + private: + class l2_cache *m_CTRcache; + class l2_cache *m_MACcache; + class l2_cache *m_BMTcache; + class memory_sub_partition *m_sub_partition; + const memory_config *m_config; + class gpgpu_sim *m_gpu; + std::list m_CTR_queue; + std::list m_Ciphertext_queue; + std::list m_MAC_queue; + std::list m_BMT_queue; + const new_addr_type CTR_mask = 0x10000000; + const new_addr_type MAC_mask = 0x00000000; + const int m_memcpy_cycle_offset = 0; + const int mee_busy_mask = 0; + + +}; \ No newline at end of file From 5f559b16fa779cca0a373ce9495adaa2668d0cc2 Mon Sep 17 00:00:00 2001 From: 
zhangqr <70464752@qq.com> Date: Sun, 18 Aug 2024 00:06:54 +0800 Subject: [PATCH 094/133] mee v0.2 --- .../SM7_QV100/base/gpgpusim.config_base | 3 +- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/delayqueue.h | 1 + src/gpgpu-sim/gpu-cache.cc | 16 + src/gpgpu-sim/gpu-cache.h | 7 +- src/gpgpu-sim/gpu-sim.cc | 2 + src/gpgpu-sim/l2cache.cc | 134 +++++---- src/gpgpu-sim/l2cache.h | 69 +++-- src/gpgpu-sim/mee.cc | 275 +++++++++++++----- src/gpgpu-sim/mee.h | 40 +-- 10 files changed, 381 insertions(+), 167 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base index a4535ba80..a40bdc3e4 100644 --- a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base +++ b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base @@ -163,7 +163,8 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dmeta N:16:128:16,L:B:m:W:P,A:192:4,32:0,32 +# -gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index dee38e86f..509fdd4b1 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -768,6 +768,7 @@ typedef std::bitset mem_access_sector_mask_t; MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W), \ MA_TUP(L1_WRBK_ACC), MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R), \ MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), MA_TUP(META_ACC), \ + MA_TUP(META_RBW),\ MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type) #define MA_TUP_BEGIN(X) enum X { diff --git a/src/gpgpu-sim/delayqueue.h b/src/gpgpu-sim/delayqueue.h index 1cf418529..4cfbc98a3 100644 --- a/src/gpgpu-sim/delayqueue.h +++ b/src/gpgpu-sim/delayqueue.h @@ -154,6 +154,7 @@ class fifo_pipeline { } bool full() const { return (m_max_len && m_length >= m_max_len); } + bool full(int n) const { return (m_max_len && m_length + n >= m_max_len); } bool is_avilable_size(unsigned size) const { return (m_max_len && m_length + size - 1 >= m_max_len); } diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index a2aeec57f..373fa4b5f 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1700,6 +1700,18 @@ enum cache_request_status read_only_cache::access( //! 
A general function that takes the result of a tag_array probe // and performs the correspding functions based on the cache configuration // The access fucntion calls this function + +enum cache_request_status data_cache::probe(new_addr_type addr, mem_fetch *mf) const { + assert(mf->get_data_size() <= m_config.get_atom_sz()); + bool wr = mf->get_is_write(); + new_addr_type block_addr = m_config.block_addr(addr); + unsigned cache_index = (unsigned)-1; + enum cache_request_status probe_status = + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); + + return probe_status; +} + enum cache_request_status data_cache::process_tag_probe( bool wr, enum cache_request_status probe_status, new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, @@ -1783,6 +1795,10 @@ enum cache_request_status l2_cache::access(new_addr_type addr, mem_fetch *mf, return data_cache::access(addr, mf, time, events); } +enum cache_request_status l2_cache::probe(new_addr_type addr, mem_fetch *mf) const { + return data_cache::probe(addr, mf); +} + /// Access function for tex_cache /// return values: RESERVATION_FAIL if request could not be accepted /// otherwise returns HIT_RESERVED or MISS; NOTE: *never* returns HIT diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 2688f5676..f5011c956 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -885,6 +885,7 @@ class cache_config { friend class l1_cache; friend class l2_cache; friend class memory_sub_partition; + friend class mee; }; class l1d_cache_config : public cache_config { @@ -1005,7 +1006,7 @@ class tag_array { friend class basline_cache; friend class l2_cache; - friend class sub_mee; + friend class mee; }; class mshr_table { @@ -1443,7 +1444,7 @@ class baseline_cache : public cache_t { friend class l2_cache; friend class data_cache; - friend class sub_mee; + friend class mee; }; /// Read only cache @@ -1571,6 +1572,7 @@ class data_cache : public baseline_cache { unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events); + enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; protected: mem_fetch_allocator *m_memfetch_creator; @@ -1700,6 +1702,7 @@ class l2_cache : public data_cache { virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf, unsigned time, std::list &events); + virtual enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; }; /*****************************************************************************/ diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 8369f6e61..c4000c148 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1938,6 +1938,8 @@ void gpgpu_sim::cycle() { m_memory_sub_partition[i]->push(mf, gpu_sim_cycle + gpu_tot_sim_cycle); if (mf) partiton_reqs_in_parallel_per_cycle++; } + if (i & 1) + m_memory_partition_unit[i >> 1]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); m_memory_sub_partition[i]->accumulate_L2cache_stats( m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 06f5a662f..f9885a18c 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -82,6 +82,34 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_gpu(gpu) { m_dram = new dram_t(m_id, m_config, m_stats, this, gpu); + char L2c_name[32]; + snprintf(L2c_name, 32, "L2_bank_%03d", m_id); + m_metainterface = 
new metainterface(this); + m_mf_allocator = new partition_mf_allocator(config); + + if (!m_config->m_L2_config.disabled()) { + m_CTRcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_MACcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_BMTcache = + new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + } + + unsigned int icnt_L2; + unsigned int L2_dram; + unsigned int dram_L2; + unsigned int L2_icnt; + sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, + &dram_L2, &L2_icnt); + m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); + m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); + + m_mee = new mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); + m_sub_partition = new memory_sub_partition *[m_config->m_n_sub_partition_per_memory_channel]; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; @@ -108,6 +136,9 @@ void memory_partition_unit::handle_memcpy_to_gpu( memory_partition_unit::~memory_partition_unit() { delete m_dram; + delete m_CTRcache; + delete m_metainterface; + delete m_mee; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; p++) { delete m_sub_partition[p]; @@ -197,10 +228,12 @@ bool memory_partition_unit::busy() const { } void memory_partition_unit::cache_cycle(unsigned cycle) { - for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; - p++) { - m_sub_partition[p]->cache_cycle(cycle); - } + // for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; + // p++) { + // m_sub_partition[p]->cache_cycle(cycle); + // } + // printf("memory_partition_unit cycle: %d\n", cycle); + m_mee->simple_cycle(cycle); } void memory_partition_unit::visualizer_print(gzFile visualizer_file) const { @@ -214,7 +247,7 @@ void memory_partition_unit::visualizer_print(gzFile visualizer_file) const { // determine whether a given subpartition can issue to DRAM bool memory_partition_unit::can_issue_to_dram(int inner_sub_partition_id) { int spid = inner_sub_partition_id; - bool sub_partition_contention = m_sub_partition[spid]->dram_mee_queue_full(); + bool sub_partition_contention = dram_mee_queue_full(); bool has_dram_resource = m_arbitration_metadata.has_credits(spid); MEMPART_DPRINTF( @@ -245,12 +278,12 @@ void memory_partition_unit::simple_dram_model_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_mee_queue_full()) { + if (!dram_mee_queue_full()) { if (mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_mee_queue_push(mf_return); + dram_mee_queue_push(mf_return); mf_return->set_status( IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); @@ -309,12 +342,12 @@ void memory_partition_unit::dram_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_mee_queue_full()) { + if (!dram_mee_queue_full()) { if 
(mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_mee_queue_push(mf_return); + dram_mee_queue_push(mf_return); mf_return->set_status(IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_arbitration_metadata.return_credit(dest_spid); @@ -340,12 +373,15 @@ void memory_partition_unit::dram_cycle() { p++) { int spid = (p + last_issued_partition + 1) % m_config->m_n_sub_partition_per_memory_channel; - if (!m_sub_partition[spid]->mee_dram_queue_empty() && + if (!mee_dram_queue_empty() && can_issue_to_dram(spid)) { - mem_fetch *mf = m_sub_partition[spid]->mee_dram_queue_top(); + mem_fetch *mf = mee_dram_queue_top(); + + if (global_sub_partition_id_to_local_id(mf->get_sub_partition_id()) != spid) continue; + if (m_dram->full(mf->is_write())) break; - m_sub_partition[spid]->mee_dram_queue_pop(); + mee_dram_queue_pop(); MEMPART_DPRINTF( "Issue mem_fetch request %p from sub partition %d to dram\n", mf, spid); @@ -435,25 +471,25 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, char L2c_name[32]; snprintf(L2c_name, 32, "L2_bank_%03d", m_id); m_L2interface = new L2interface(this); - m_metainterface = new metainterface(this); + // m_metainterface = new metainterface(this); m_mf_allocator = new partition_mf_allocator(config); if (!m_config->m_L2_config.disabled()) { m_L2cache = new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, m_L2interface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); - m_CTRcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, - m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); - m_MACcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, - m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); - m_BMTcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, - m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_CTRcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_MACcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_BMTcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); } - m_sub_mee = new sub_mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); + // m_sub_mee = new sub_mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); unsigned int icnt_L2; unsigned int L2_dram; @@ -463,8 +499,8 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, &dram_L2, &L2_icnt); m_icnt_L2_queue = new fifo_pipeline("icnt-to-L2", 0, icnt_L2); m_L2_mee_queue = new fifo_pipeline("L2-to-mee", 0, L2_dram); - m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); - m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); + // m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); + // m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); m_mee_L2_queue = new fifo_pipeline("mee-to-L2", 0, dram_L2); m_L2_icnt_queue = new fifo_pipeline("L2-to-icnt", 0, L2_icnt); wb_addr = -1; @@ -476,13 +512,11 @@ memory_sub_partition::~memory_sub_partition() { delete m_mee_L2_queue; delete m_L2_icnt_queue; delete m_L2cache; - delete m_CTRcache; delete m_L2interface; - delete m_metainterface; - delete m_sub_mee; } void 
memory_sub_partition::cache_cycle(unsigned cycle) { + // printf("memory_partition_unit cycle: %d\n", cycle); // L2 fill responses if (!m_config->m_L2_config.disabled()) { if (m_L2cache->access_ready() && !m_L2_icnt_queue->full()) { @@ -530,7 +564,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { } } - m_sub_mee->simple_cycle(cycle); + // m_mee->simple_cycle(cycle); // prior L2 misses inserted into m_L2_mee_queue here if (!m_config->m_L2_config.disabled()) m_L2cache->cycle(); @@ -626,64 +660,64 @@ bool memory_sub_partition::full(unsigned size) const { // interface to L2_mee_queue -bool memory_sub_partition::L2_mee_queue_empty() const { - return m_L2_mee_queue->empty(); // TODO +bool memory_partition_unit::L2_mee_queue_empty(unsigned spid) const { + return m_sub_partition[spid]->m_L2_mee_queue->empty(); // TODO } -class mem_fetch *memory_sub_partition::L2_mee_queue_top() const { - return m_L2_mee_queue->top(); // TODO +class mem_fetch *memory_partition_unit::L2_mee_queue_top(unsigned spid) const { + return m_sub_partition[spid]->m_L2_mee_queue->top(); // TODO } -void memory_sub_partition::L2_mee_queue_pop() { m_L2_mee_queue->pop(); } // TODO +void memory_partition_unit::L2_mee_queue_pop(unsigned spid) { m_sub_partition[spid]->m_L2_mee_queue->pop(); } // TODO // interface to mee_dram_queue -bool memory_sub_partition::mee_dram_queue_empty() const { +bool memory_partition_unit::mee_dram_queue_empty() const { return m_mee_dram_queue->empty(); // TODO } -class mem_fetch *memory_sub_partition::mee_dram_queue_top() const { +class mem_fetch *memory_partition_unit::mee_dram_queue_top() const { return m_mee_dram_queue->top(); // TODO } -void memory_sub_partition::mee_dram_queue_pop() { m_mee_dram_queue->pop(); } // TODO +void memory_partition_unit::mee_dram_queue_pop() { m_mee_dram_queue->pop(); } // TODO -bool memory_sub_partition::mee_dram_queue_full() const { +bool memory_partition_unit::mee_dram_queue_full() const { return m_mee_dram_queue->full(); //TODO } -void memory_sub_partition::mee_dram_queue_push(class mem_fetch *mf) { +void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf) { m_mee_dram_queue->push(mf); //TODO } // interface to dram_mee_queue -bool memory_sub_partition::dram_mee_queue_empty() const { +bool memory_partition_unit::dram_mee_queue_empty() const { return m_dram_mee_queue->empty(); // TODO } -class mem_fetch *memory_sub_partition::dram_mee_queue_top() const { +class mem_fetch *memory_partition_unit::dram_mee_queue_top() const { return m_dram_mee_queue->top(); // TODO } -void memory_sub_partition::dram_mee_queue_pop() { m_dram_mee_queue->pop(); } // TODO +void memory_partition_unit::dram_mee_queue_pop() { m_dram_mee_queue->pop(); } // TODO -bool memory_sub_partition::dram_mee_queue_full() const { +bool memory_partition_unit::dram_mee_queue_full() const { return m_dram_mee_queue->full(); //TODO } -void memory_sub_partition::dram_mee_queue_push(class mem_fetch *mf) { +void memory_partition_unit::dram_mee_queue_push(class mem_fetch *mf) { m_dram_mee_queue->push(mf); //TODO } // interface to mee_L2_queue -bool memory_sub_partition::mee_L2_queue_full() const { - return m_mee_L2_queue->full(); //TODO +bool memory_partition_unit::mee_L2_queue_full(unsigned spid) const { + return m_sub_partition[spid]->m_mee_L2_queue->full(); //TODO } -void memory_sub_partition::mee_L2_queue_push(class mem_fetch *mf) { - m_mee_L2_queue->push(mf); //TODO +void memory_partition_unit::mee_L2_queue_push(unsigned spid, class mem_fetch *mf) { + 
m_sub_partition[spid]->m_mee_L2_queue->push(mf); //TODO } void memory_sub_partition::print_cache_stat(unsigned &accesses, @@ -767,7 +801,7 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned memory_sub_partition::flushL2() { if (!m_config->m_L2_config.disabled()) { m_L2cache->flush();//TODO - m_CTRcache->flush(); + // m_CTRcache->flush(); } return 0; // TODO: write the flushed data to the main memory } @@ -775,7 +809,7 @@ unsigned memory_sub_partition::flushL2() { unsigned memory_sub_partition::invalidateL2() { if (!m_config->m_L2_config.disabled()) { m_L2cache->invalidate();//TODO - m_CTRcache->invalidate(); + // m_CTRcache->invalidate(); } return 0; } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 4d640c995..8226c18a8 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -108,13 +108,44 @@ class memory_partition_unit { return m_gpu; } + bool L2_mee_queue_empty(unsigned spid) const; + class mem_fetch *L2_mee_queue_top(unsigned spid) const; + void L2_mee_queue_pop(unsigned spid); + + bool mee_dram_queue_empty() const; + class mem_fetch *mee_dram_queue_top() const; + void mee_dram_queue_pop(); + bool mee_dram_queue_full() const; + void mee_dram_queue_push(class mem_fetch *mf); + + bool dram_mee_queue_empty() const; + class mem_fetch *dram_mee_queue_top() const; + void dram_mee_queue_pop(); + bool dram_mee_queue_full() const; + void dram_mee_queue_push(class mem_fetch *mf); + + void mee_L2_queue_push(unsigned spid, class mem_fetch *mf); + bool mee_L2_queue_full(unsigned spid) const; + + class memory_sub_partition **m_sub_partition; + private: unsigned m_id; const memory_config *m_config; class memory_stats_t *m_stats; - class memory_sub_partition **m_sub_partition; + // class memory_sub_partition **m_sub_partition; class dram_t *m_dram; + class l2_cache *m_CTRcache; + class l2_cache *m_MACcache; + class l2_cache *m_BMTcache; + class mee *m_mee; + class metainterface *m_metainterface; + partition_mf_allocator *m_mf_allocator; + + fifo_pipeline *m_mee_dram_queue; + fifo_pipeline *m_dram_mee_queue; + class arbitration_metadata { public: arbitration_metadata(const memory_config *config); @@ -156,7 +187,7 @@ class memory_partition_unit { class gpgpu_sim *m_gpu; - friend class sub_mee; + friend class mee; }; class memory_sub_partition { @@ -224,6 +255,14 @@ class memory_sub_partition { } // class l2_cache *m_CTRcache; std::vector breakdown_request_to_sector_requests(mem_fetch *mf); + + // these are various FIFOs between units within a memory partition + fifo_pipeline *m_icnt_L2_queue; + fifo_pipeline *m_L2_mee_queue; + // fifo_pipeline *m_mee_dram_queue; + // fifo_pipeline *m_dram_mee_queue; + fifo_pipeline *m_mee_L2_queue; + fifo_pipeline *m_L2_icnt_queue; // L2 cache hit response queue private: // data @@ -231,11 +270,11 @@ class memory_sub_partition { const memory_config *m_config; class l2_cache *m_L2cache; class L2interface *m_L2interface; - class l2_cache *m_CTRcache; - class l2_cache *m_MACcache; - class l2_cache *m_BMTcache; - class sub_mee *m_sub_mee; - class metainterface *m_metainterface; + // class l2_cache *m_CTRcache; + // class l2_cache *m_MACcache; + // class l2_cache *m_BMTcache; + // class mee *m_mee; + // class metainterface *m_metainterface; class gpgpu_sim *m_gpu; partition_mf_allocator *m_mf_allocator; @@ -246,14 +285,6 @@ class memory_sub_partition { }; std::queue m_rop; - // these are various FIFOs between units within a memory partition - fifo_pipeline *m_icnt_L2_queue; - fifo_pipeline *m_L2_mee_queue; - fifo_pipeline 
*m_mee_dram_queue; - fifo_pipeline *m_dram_mee_queue; - fifo_pipeline *m_mee_L2_queue; - fifo_pipeline *m_L2_icnt_queue; // L2 cache hit response queue - class mem_fetch *L2dramout; unsigned long long int wb_addr; @@ -295,19 +326,19 @@ class L2interface : public mem_fetch_interface { class metainterface : public mem_fetch_interface { public: - metainterface(memory_sub_partition *unit) { m_unit = unit; } + metainterface(memory_partition_unit *unit) { m_unit = unit; } virtual ~metainterface() {} virtual bool full(unsigned size, bool write) const { // assume read and write packets all same size - return m_unit->m_mee_dram_queue->full(); + return m_unit->mee_dram_queue_full(); } virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); - m_unit->m_mee_dram_queue->push(mf); + m_unit->mee_dram_queue_push(mf); } private: - memory_sub_partition *m_unit; + memory_partition_unit *m_unit; }; #endif diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 56182216e..7b322530f 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,26 +1,37 @@ #include "mee.h" #include -sub_mee::sub_mee(class memory_sub_partition *sub_partition, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : - m_sub_partition(sub_partition), +mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : + m_unit(unit), m_CTRcache(CTRcache), m_MACcache(MACcache), m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { + m_CTR_queue = new fifo_pipeline("meta-queue", 0, 64); + m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_queue = new fifo_pipeline("meta-queue", 0, 64); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, 64); + m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + + m_OTP_queue = new fifo_pipeline("meta-queue", 10, 64); + m_AES_queue = new fifo_pipeline("meta-queue", 0, 64); } int decode(int addr) { return (addr & 16128) >> 8; } -void sub_mee::print_addr(char s[], mem_fetch *mf) { +void mee::print_addr(char s[], mem_fetch *mf) { if (mf->get_sub_partition_id() == 0) { - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); // print_tag(); } } -void sub_mee::print_tag() { +void mee::print_tag() { // if (get_sub_partition_id(mf) == 0) { // for (unsigned i = 0; i < m_config->m_META_config.get_num_lines(); i++) { for (unsigned i = 188; i < 192; i++) { @@ -33,7 +44,7 @@ void sub_mee::print_tag() { // } } -new_addr_type sub_mee::get_partition_addr(mem_fetch *mf) { +new_addr_type mee::get_partition_addr(mem_fetch *mf) { new_addr_type partition_addr = mf->get_addr() >> (8 + 6) << 8; partition_addr |= mf->get_addr() & ((1 << 8) - 1); // return partition_addr; @@ -41,17 +52,17 @@ new_addr_type sub_mee::get_partition_addr(mem_fetch *mf) { return mf->get_partition_addr(); } 
-new_addr_type sub_mee::get_sub_partition_id(mem_fetch *mf) { +new_addr_type mee::get_sub_partition_id(mem_fetch *mf) { // return (mf->get_addr() >> 8) & ((1 << 6) - 1); return mf->get_sub_partition_id(); } -bool sub_mee::META_queue_empty() { - return m_CTR_queue.empty() && m_Ciphertext_queue.empty() && m_MAC_queue.empty(); +bool mee::META_queue_empty() { + return m_CTR_queue->empty() && m_Ciphertext_queue->empty() && m_MAC_queue->empty(); } -new_addr_type sub_mee::get_addr(new_addr_type sub_partition_id, new_addr_type partition_addr) { +new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partition_addr) { new_addr_type new_addr = partition_addr >> 8 << (8 + 6); new_addr |= partition_addr & ((1 << 8) - 1); new_addr |= sub_partition_id << 8; @@ -59,19 +70,19 @@ new_addr_type sub_mee::get_addr(new_addr_type sub_partition_id, new_addr_type pa return new_addr; } -void sub_mee::gen_CTR_mf(mem_fetch *mf, bool wr) { +void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); - partition_addr = partition_addr >> 14 << 7; + partition_addr = partition_addr >> 18 << 7; new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); CTR_addr |= CTR_mask; - meta_access(m_CTR_queue, CTR_addr, META_ACC, + meta_access(m_CTR_queue, CTR_addr, meta_acc, 128, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); } -void sub_mee::gen_MAC_mf(mem_fetch *mf, bool wr) { +void mee::gen_MAC_mf(mem_fetch *mf, bool wr) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); partition_addr = partition_addr >> 7 << 3; @@ -79,12 +90,12 @@ void sub_mee::gen_MAC_mf(mem_fetch *mf, bool wr) { MAC_addr |= MAC_mask; meta_access(m_MAC_queue, MAC_addr, META_ACC, - 64, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + 68, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); } -void sub_mee::meta_access( - std::list &m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, +void mee::meta_access( + fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, mem_fetch *original_mf) const { @@ -100,55 +111,155 @@ void sub_mee::meta_access( // mf->set_chip(original_mf->get_sub_partition_id) std::vector reqs; - // if (m_config->m_L2_config.m_cache_type == SECTOR) - reqs = m_sub_partition->breakdown_request_to_sector_requests(mf); - // else - // reqs.push_back(mf); + if (m_config->m_META_config.m_cache_type == SECTOR) + reqs = m_unit->m_sub_partition[0]->breakdown_request_to_sector_requests(mf); + else + reqs.push_back(mf); for (unsigned i = 0; i < reqs.size(); ++i) { mem_fetch *req = reqs[i]; - m_META_queue.push_back(req); + m_META_queue->push(req); + } +} + +void mee::CT_cycle() { + if (!m_Ciphertext_RET_queue->empty()) { + mem_fetch *mf_return = m_Ciphertext_RET_queue->top(); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + if (mf_return->is_write() && !m_unit->mee_L2_queue_full(spid)) { // write + // print_addr("mee to L2 W:\t", mf_return); + m_unit->mee_L2_queue_push(spid, mf_return); + m_Ciphertext_RET_queue->pop(); + } else if (!m_AES_queue->full()) { // read + m_AES_queue->push(mf_return); + m_Ciphertext_RET_queue->pop(); + } + } + + if 
(!m_Ciphertext_queue->empty()) { + mem_fetch *mf = m_Ciphertext_queue->top(); + if (mf->is_write() && !m_AES_queue->full()) { // write + m_AES_queue->push(mf); + m_Ciphertext_queue->pop(); + } else if (!m_unit->mee_dram_queue_full()) { // read + m_unit->mee_dram_queue_push(mf); + m_Ciphertext_queue->pop(); + } + } +} + +void mee::AES_cycle() { + if (!m_AES_queue->empty()) { + mem_fetch *mf = m_AES_queue->top(); + new_addr_type REQ_addr = (new_addr_type) mf; + new_addr_type OTP_addr = m_OTP_table[REQ_addr]; + int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); + // if (mf->get_sub_partition_id() == 0) + // printf("%x\n", OTP_addr); + if (m_OTP_set[OTP_addr]) { + if (mf->is_write()) { + if (!m_unit->mee_dram_queue_full()) { + m_OTP_set[OTP_addr]--; + m_OTP_table[REQ_addr] = 0; + m_unit->mee_dram_queue_push(mf); + m_AES_queue->pop(); + } + } else if (!m_unit->mee_L2_queue_full(spid)) { + m_OTP_set[OTP_addr]--; + m_OTP_table[REQ_addr] = 0; + // print_addr("mee to L2 R:\t", mf); + m_unit->mee_L2_queue_push(spid, mf); + m_AES_queue->pop(); + + } + } + } + + if (!m_OTP_queue->empty()){ + mem_fetch *mf = m_OTP_queue->top(); + if (mf) { + m_OTP_set[(new_addr_type)mf]++; + } + delete mf; + m_OTP_queue->pop(); } } -void sub_mee::CTR_cycle() { +void mee::CTR_cycle() { + if (!m_CTR_RET_queue->empty()) { + mem_fetch *mf_return = m_CTR_RET_queue->top(); + if (mf_return->get_type() == META_RBW) { + m_CTR_RET_queue->pop(); + gen_CTR_mf(mf_return->get_original_mf(), META_ACC, true); + delete mf_return; + } else { + // print_addr("MISS OTP:\t\t", mf_return); + if (!m_OTP_queue->full()) { + m_OTP_queue->push(mf_return); + m_CTR_RET_queue->pop(); + } + } + } + m_CTRcache->cycle(); - if (!m_CTR_queue.empty() && !m_sub_partition->mee_dram_queue_full()) { - mem_fetch *mf = m_CTR_queue.front(); + + if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !m_OTP_queue->full()) { + mem_fetch *mf = m_CTR_queue->top(); + // print_addr("CTR cycle access:\t\t", mf); + + if (mf->is_write()) { + if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) { + return; + } + } + + if (mf->get_type() != META_RBW) { + m_OTP_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf; + } + std::list events; enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { - m_CTR_queue.pop_front(); + // if (!m_OTP_queue->full()) { + // print_addr("HIT OTP:\t\t", mf); + m_OTP_queue->push(mf); + m_CTR_queue->pop(); + // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); - m_CTR_queue.pop_front(); + m_CTR_queue->pop(); } else { // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); - if (get_sub_partition_id(mf) == 0) - enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + // if (get_sub_partition_id(mf) == 0) + // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); assert(!read_sent); } - } + } + // else if (mf->get_sub_partition_id() == 1) { + // if (m_unit->mee_dram_queue_full()) printf("AAAAAAAAAAAAAA\n"); + // if (m_OTP_queue->full()) printf("BBBBBBBBBBBBBBBBB\n"); + // if (!m_OTP_queue->empty() && m_CTR_queue->empty()) 
printf("CCCCCCCCCCCCCCCCCCCCCCCC\n"); + //} }; -void sub_mee::MAC_cycle() { +void mee::MAC_cycle() { m_MACcache->cycle(); - if (!m_MAC_queue.empty() && !m_sub_partition->mee_dram_queue_full()) { - mem_fetch *mf = m_MAC_queue.front(); + if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full()) { + mem_fetch *mf = m_MAC_queue->top(); std::list events; enum cache_request_status status = m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); if (status == HIT) { - m_MAC_queue.pop_front(); + m_MAC_queue->pop(); } else if (status != RESERVATION_FAIL) { // set wating for CTR fill - m_MAC_queue.pop_front(); + m_MAC_queue->pop(); } else { assert(!write_sent); assert(!read_sent); @@ -156,109 +267,119 @@ void sub_mee::MAC_cycle() { } }; -void sub_mee::BMT_cycle() { +void mee::BMT_cycle() { }; -void sub_mee::AES_cycle() { -}; - -void sub_mee::META_fill_responses(class l2_cache *m_METAcache, const new_addr_type MASK) { +void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { if (m_METAcache->access_ready()) { mem_fetch *mf = m_METAcache->next_access(); + m_META_RET_queue->push(mf); // print_addr("fill responses:", mf); // reply(m_METAcache, mf); - delete mf; + // delete mf; } } -void sub_mee::META_fill(class l2_cache *m_METAcache, mem_fetch *mf, const new_addr_type MASK) { - if (!(mf->get_addr() & MASK) && m_METAcache->waiting_for_fill(mf)) { +void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK) { + if ((mf->get_addr() & MASK) && m_METAcache->waiting_for_fill(mf)) { // print_addr("wating for fill:\t\t", mf); - if (m_METAcache->fill_port_free()) { + if (m_METAcache->fill_port_free() && !m_META_RET_queue->full()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); // print_addr("fill:\t\t\t\t", mf); - m_sub_partition->dram_mee_queue_pop(); + // if (mf->get_sub_partition_id() == 1) { + // printf("CTR Fill: %p\n", mf); + // // printf("CTR Next: %p\n", m_CTR_queue->top()); + // } + m_unit->dram_mee_queue_pop(); } } } -void sub_mee::simple_cycle(unsigned cycle) { +void mee::simple_cycle(unsigned cycle) { // META Cache fill responses - META_fill_responses(m_CTRcache, CTR_mask); - META_fill_responses(m_MACcache, MAC_mask); + META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); + // META_fill_responses(m_MACcache, MAC_mask); // META_fill_responses(m_BMTcache); // dram to mee - if (!m_sub_partition->dram_mee_queue_empty()) { - mem_fetch *mf_return = m_sub_partition->dram_mee_queue_top(); + if (!m_unit->dram_mee_queue_empty()) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(); // print_addr("dram_mee_queue_top:\t", mf_return); // mee to L2 - META_fill(m_CTRcache, mf_return, CTR_mask); - META_fill(m_MACcache, mf_return, MAC_mask); + + // META_fill(m_MACcache, mf_return, MAC_mask); // META_fill(m_BMTcache, mf_return); - // if (!m_sub_partition->mee_L2_queue_full()) { + // if (!m_unit->mee_L2_queue_full()) { - if (mf_return->get_access_type() == META_ACC) { // META访存的返回,需要响应 + if (mf_return->get_access_type() >= META_ACC) { // META访存的返回,需要响应 // printf("Success handle CTR_ACC: "); - // print_addr(mf_return); + // print_addr("META return to mee", mf_return); // delete mf_return; + META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask); } else { // 密文访存返回 // reply L2 read // reply L2 write back - 
m_sub_partition->mee_L2_queue_push(mf_return); - m_sub_partition->dram_mee_queue_pop(); - // print_addr(mf_return); + //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); + if (!m_Ciphertext_RET_queue->full()) { + m_Ciphertext_RET_queue->push(mf_return); + m_unit->dram_mee_queue_pop(); + } + // print_addr("mee to L2: ", mf_return); } // } } + // printf("L2 to mee queue: %d %d\n", m_unit->m_sub_partition[0]->m_L2_mee_queue->empty(), m_unit->m_sub_partition[0]->m_L2_mee_queue->empty()); // L2 to mee - if (!m_sub_partition->L2_mee_queue_empty()) { - mem_fetch *mf = m_sub_partition->L2_mee_queue_top(); + if (!m_unit->L2_mee_queue_empty(cycle&1)) { + mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + // print_addr("L2 to mee: ", mf); // mee to dram - if (!m_sub_partition->mee_dram_queue_full() && META_queue_empty()) { + if (!m_unit->mee_dram_queue_full() && !m_CTR_queue->full() && !m_MAC_queue->full() && !m_BMT_queue->full() && !m_Ciphertext_queue->full()) { + // print_addr("L2 to mee: ", mf); if (!mf->is_write()) { // L2 read // CTR access - gen_CTR_mf(mf, false); + gen_CTR_mf(mf, META_ACC, false); // Ciphertext access - m_Ciphertext_queue.push_back(mf); + m_Ciphertext_queue->push(mf); // MAC access - gen_MAC_mf(mf, false); + // gen_MAC_mf(mf, false); // AES Decryption - AES_cycle(); + // AES_cycle(); // Hash MAC // MAC Check // BMT Check } else { // L2 write back // CTR access - gen_CTR_mf(mf, false); + gen_CTR_mf(mf, META_RBW, false); // CTR update - gen_CTR_mf(mf, true); + // gen_CTR_mf(mf, META_ACC, true); // AES Ecryption + + // AES_queue.push(mf); + // Ciphertext Update + m_Ciphertext_queue->push(mf); // MAC access - gen_MAC_mf(mf, false); + // gen_MAC_mf(mf, false); // MAC Hash // MAC Update - gen_MAC_mf(mf, true); + // gen_MAC_mf(mf, true); // BMT Update } - m_sub_partition->L2_mee_queue_pop(); + m_unit->L2_mee_queue_pop(cycle&1); } else { } } + AES_cycle(); CTR_cycle(); - if (!m_Ciphertext_queue.empty() && !m_sub_partition->mee_dram_queue_full() && m_CTR_queue.empty()) { - mem_fetch *mf = m_Ciphertext_queue.front(); - m_sub_partition->mee_dram_queue_push(mf); - m_Ciphertext_queue.pop_front(); - } - MAC_cycle(); + CT_cycle(); + // MAC_cycle(); } -void sub_mee::cycle(unsigned cycle) { +void mee::cycle(unsigned cycle) { } \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 4b12b5add..dead49240 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -12,34 +12,28 @@ class mee { public: - - private: - -}; - -class sub_mee { - public: - sub_mee(class memory_sub_partition *sub_partition, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); + mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); void cycle(unsigned cycle); void simple_cycle(unsigned cycle); void print_addr(char s[], mem_fetch *mf); void print_tag(); - void meta_access(std::list &m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, mem_fetch *original_mf) const; + void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, mem_fetch *original_mf) const; void CTR_cycle(); void MAC_cycle(); void BMT_cycle(); void 
AES_cycle(); + void CT_cycle(); new_addr_type get_partition_addr(mem_fetch *mf); new_addr_type get_sub_partition_id(mem_fetch *mf); new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); - void gen_CTR_mf(mem_fetch *mf, bool wr); + void gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr); void gen_MAC_mf(mem_fetch *mf, bool wr); // void gen_BMT_mf(mem_fetch *mf, bool wr); bool META_queue_empty(); - void META_fill_responses(class l2_cache *m_METAcache, const new_addr_type MASK); - void META_fill(class l2_cache *m_METAcache, mem_fetch *mf, const new_addr_type MASK); + void META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); + void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK); bool CTR_busy(); bool MAC_busy(); @@ -51,17 +45,27 @@ class sub_mee { class l2_cache *m_CTRcache; class l2_cache *m_MACcache; class l2_cache *m_BMTcache; - class memory_sub_partition *m_sub_partition; + class memory_partition_unit *m_unit; const memory_config *m_config; class gpgpu_sim *m_gpu; - std::list m_CTR_queue; - std::list m_Ciphertext_queue; - std::list m_MAC_queue; - std::list m_BMT_queue; + fifo_pipeline *m_CTR_queue; + fifo_pipeline *m_Ciphertext_queue; + fifo_pipeline *m_MAC_queue; + fifo_pipeline *m_BMT_queue; + + fifo_pipeline *m_CTR_RET_queue; + fifo_pipeline *m_MAC_RET_queue; + fifo_pipeline *m_BMT_RET_queue; + fifo_pipeline *m_Ciphertext_RET_queue; + fifo_pipeline *m_OTP_queue; + fifo_pipeline *m_AES_queue; const new_addr_type CTR_mask = 0x10000000; const new_addr_type MAC_mask = 0x00000000; const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; - + typedef tr1_hash_map table; + typedef tr1_hash_map set; + table m_OTP_table; + set m_OTP_set; }; \ No newline at end of file From 0e16e851fb5a036cfbda6a85429e78bdd4df6d34 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 18 Aug 2024 01:49:02 +0800 Subject: [PATCH 095/133] mee v0.3 --- src/gpgpu-sim/mee.cc | 113 ++++++++++++++++++++++++++++++++++++++----- src/gpgpu-sim/mee.h | 8 ++- 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 7b322530f..115339534 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -20,6 +20,9 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_OTP_queue = new fifo_pipeline("meta-queue", 10, 64); m_AES_queue = new fifo_pipeline("meta-queue", 0, 64); + + m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 64); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); } int decode(int addr) { return (addr & 16128) >> 8; @@ -90,7 +93,7 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr) { MAC_addr |= MAC_mask; meta_access(m_MAC_queue, MAC_addr, META_ACC, - 68, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + 8, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); } @@ -126,12 +129,15 @@ void mee::CT_cycle() { if (!m_Ciphertext_RET_queue->empty()) { mem_fetch *mf_return = m_Ciphertext_RET_queue->top(); int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); - if (mf_return->is_write() && !m_unit->mee_L2_queue_full(spid)) { // write + if (mf_return->is_write()) { // write // print_addr("mee to L2 W:\t", mf_return); - m_unit->mee_L2_queue_push(spid, mf_return); - m_Ciphertext_RET_queue->pop(); - } else if (!m_AES_queue->full()) { // read + if 
(!m_unit->mee_L2_queue_full(spid)){ + m_unit->mee_L2_queue_push(spid, mf_return); + m_Ciphertext_RET_queue->pop(); + } + } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read m_AES_queue->push(mf_return); + m_MAC_HASH_queue->push(mf_return); m_Ciphertext_RET_queue->pop(); } } @@ -158,10 +164,11 @@ void mee::AES_cycle() { // printf("%x\n", OTP_addr); if (m_OTP_set[OTP_addr]) { if (mf->is_write()) { - if (!m_unit->mee_dram_queue_full()) { + if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { m_OTP_set[OTP_addr]--; m_OTP_table[REQ_addr] = 0; m_unit->mee_dram_queue_push(mf); + m_MAC_HASH_queue->push(mf); m_AES_queue->pop(); } } else if (!m_unit->mee_L2_queue_full(spid)) { @@ -185,6 +192,32 @@ void mee::AES_cycle() { } } +void mee::MAC_CHECK_cycle() { + if (!m_MAC_CHECK_queue->empty()) { + // printf("AAAAAAAAAAAAA\n"); + mem_fetch *mf = m_MAC_CHECK_queue->top(); + new_addr_type REQ_addr = (new_addr_type) mf; + new_addr_type HASH_addr = m_MAC_table[REQ_addr]; + // if (mf->get_sub_partition_id() == 0) + // printf("%x\n", OTP_addr); + if (true || m_MAC_set[HASH_addr]) { + m_MAC_set[HASH_addr]--; + m_MAC_table[REQ_addr] = 0; + m_MAC_CHECK_queue->pop(); + } + } + + if (!m_MAC_HASH_queue->empty()) { + // printf("BBBBBBBBBBBBBBB\n"); + mem_fetch *mf = m_MAC_HASH_queue->top(); + if (mf) { + m_MAC_set[(new_addr_type)mf]++; + } + // delete mf; + m_MAC_HASH_queue->pop(); + } +} + void mee::CTR_cycle() { if (!m_CTR_RET_queue->empty()) { mem_fetch *mf_return = m_CTR_RET_queue->top(); @@ -248,22 +281,74 @@ void mee::CTR_cycle() { }; void mee::MAC_cycle() { + if (!m_MAC_RET_queue->empty()) { + mem_fetch *mf_return = m_MAC_RET_queue->top(); + if (mf_return->is_write()) { + m_MAC_RET_queue->pop(); + delete mf_return; + } else { + // print_addr("MISS OTP:\t\t", mf_return); + if (!m_MAC_CHECK_queue->full()) { + // m_MAC_CHECK_queue->push(mf_return); + m_MAC_RET_queue->pop(); + } else { + if (mf_return->get_sub_partition_id() == 1) { + print_addr("MAC Full:", mf_return); + } + } + } + } + m_MACcache->cycle(); - if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full()) { + + if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !m_MAC_CHECK_queue->full()) { mem_fetch *mf = m_MAC_queue->top(); + print_addr("MAC cycle access:\t\t", mf); + + if (mf->is_write()) { + if (!m_MAC_set[(new_addr_type)mf]) { + return; + } else { + + } + } else { + m_MAC_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf; + } + std::list events; enum cache_request_status status = m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); + // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { + // if (!m_OTP_queue->full()) { + // print_addr("HIT OTP:\t\t", mf); + if (mf->is_write()) { + m_MAC_set[(new_addr_type)mf]--; + } else { + // m_MAC_CHECK_queue->push(mf); + } m_MAC_queue->pop(); + // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill + // print_addr("CTR cycle access:\t\t", mf); + if (mf->is_write()) { + m_MAC_set[(new_addr_type)mf]--; + } m_MAC_queue->pop(); } else { + // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + // if (get_sub_partition_id(mf) == 0) + // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); assert(!read_sent); } + } + else { + if (m_unit->mee_dram_queue_full()) printf("AAAAAAAAAAAAAA\n"); + if 
(m_MAC_CHECK_queue->full()) printf("BBBBBBBBBBBBBBBBB\n"); + // if (!m_OTP_queue->empty() && m_CTR_queue->empty()) printf("CCCCCCCCCCCCCCCCCCCCCCCC\n"); } }; @@ -275,7 +360,7 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipelineaccess_ready()) { mem_fetch *mf = m_METAcache->next_access(); m_META_RET_queue->push(mf); - // print_addr("fill responses:", mf); + print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; } @@ -287,7 +372,7 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET if (m_METAcache->fill_port_free() && !m_META_RET_queue->full()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - // print_addr("fill:\t\t\t\t", mf); + print_addr("fill:\t\t\t\t", mf); // if (mf->get_sub_partition_id() == 1) { // printf("CTR Fill: %p\n", mf); // // printf("CTR Next: %p\n", m_CTR_queue->top()); @@ -300,7 +385,7 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET void mee::simple_cycle(unsigned cycle) { // META Cache fill responses META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); - // META_fill_responses(m_MACcache, MAC_mask); + META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); // META_fill_responses(m_BMTcache); // dram to mee if (!m_unit->dram_mee_queue_empty()) { @@ -317,6 +402,7 @@ void mee::simple_cycle(unsigned cycle) { // print_addr("META return to mee", mf_return); // delete mf_return; META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask); + META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask); } else { // 密文访存返回 // reply L2 read // reply L2 write back @@ -344,7 +430,7 @@ void mee::simple_cycle(unsigned cycle) { // Ciphertext access m_Ciphertext_queue->push(mf); // MAC access - // gen_MAC_mf(mf, false); + gen_MAC_mf(mf, false); // AES Decryption // AES_cycle(); // Hash MAC @@ -366,7 +452,7 @@ void mee::simple_cycle(unsigned cycle) { // gen_MAC_mf(mf, false); // MAC Hash // MAC Update - // gen_MAC_mf(mf, true); + gen_MAC_mf(mf, true); // BMT Update } @@ -378,7 +464,8 @@ void mee::simple_cycle(unsigned cycle) { AES_cycle(); CTR_cycle(); CT_cycle(); - // MAC_cycle(); + MAC_CHECK_cycle(); + MAC_cycle(); } void mee::cycle(unsigned cycle) { diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index dead49240..067b18141 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -23,6 +23,7 @@ class mee { void BMT_cycle(); void AES_cycle(); void CT_cycle(); + void MAC_CHECK_cycle(); new_addr_type get_partition_addr(mem_fetch *mf); new_addr_type get_sub_partition_id(mem_fetch *mf); new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); @@ -57,10 +58,13 @@ class mee { fifo_pipeline *m_MAC_RET_queue; fifo_pipeline *m_BMT_RET_queue; fifo_pipeline *m_Ciphertext_RET_queue; + fifo_pipeline *m_OTP_queue; fifo_pipeline *m_AES_queue; + fifo_pipeline *m_MAC_HASH_queue; + fifo_pipeline *m_MAC_CHECK_queue; const new_addr_type CTR_mask = 0x10000000; - const new_addr_type MAC_mask = 0x00000000; + const new_addr_type MAC_mask = 0x20000000; const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; @@ -68,4 +72,6 @@ class mee { typedef tr1_hash_map set; table m_OTP_table; set m_OTP_set; + table m_MAC_table; + set m_MAC_set; }; \ No newline at end of file From fe66d67248bf500909773acf27f5685475c520a4 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 21 Aug 2024 00:44:32 +0800 Subject: [PATCH 096/133] mee v0.3 --- src/gpgpu-sim/delayqueue.h | 2 +- src/gpgpu-sim/l2cache.cc | 4 + 
src/gpgpu-sim/l2cache.h | 1 + src/gpgpu-sim/mee.cc | 357 +++++++++++++++++++++++++++---------- src/gpgpu-sim/mee.h | 54 +++++- 5 files changed, 316 insertions(+), 102 deletions(-) diff --git a/src/gpgpu-sim/delayqueue.h b/src/gpgpu-sim/delayqueue.h index 4cfbc98a3..f1ad66073 100644 --- a/src/gpgpu-sim/delayqueue.h +++ b/src/gpgpu-sim/delayqueue.h @@ -154,7 +154,7 @@ class fifo_pipeline { } bool full() const { return (m_max_len && m_length >= m_max_len); } - bool full(int n) const { return (m_max_len && m_length + n >= m_max_len); } + bool full(int n) const { return (m_max_len && m_length + n > m_max_len); } bool is_avilable_size(unsigned size) const { return (m_max_len && m_length + size - 1 >= m_max_len); } diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index f9885a18c..133ab94b1 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -686,6 +686,10 @@ bool memory_partition_unit::mee_dram_queue_full() const { return m_mee_dram_queue->full(); //TODO } +bool memory_partition_unit::mee_dram_queue_full(int size) const { + return m_mee_dram_queue->full(size); //TODO +} + void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf) { m_mee_dram_queue->push(mf); //TODO } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 8226c18a8..6c982bab4 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -116,6 +116,7 @@ class memory_partition_unit { class mem_fetch *mee_dram_queue_top() const; void mee_dram_queue_pop(); bool mee_dram_queue_full() const; + bool mee_dram_queue_full(int size) const; void mee_dram_queue_push(class mem_fetch *mf); bool dram_mee_queue_empty() const; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 115339534..c78a4fa26 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -8,27 +8,33 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - m_CTR_queue = new fifo_pipeline("meta-queue", 0, 64); - m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 64); - m_MAC_queue = new fifo_pipeline("meta-queue", 0, 64); - m_BMT_queue = new fifo_pipeline("meta-queue", 0, 64); + m_CTR_queue = new fifo_pipeline("meta-queue", 0, 8); + m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 8); + m_MAC_queue = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, 8); - m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 8); + m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 8); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_OTP_queue = new fifo_pipeline("meta-queue", 10, 64); - m_AES_queue = new fifo_pipeline("meta-queue", 0, 64); + m_OTP_queue = new fifo_pipeline("meta-queue", 10, 18); + m_AES_queue = new fifo_pipeline("meta-queue", 0, 8); - m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 64); - m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); + + m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); + m_CTR_BMT_Buffer = new 
fifo_pipeline("meta-queue", 0, 8); + + BMT_busy = false; } int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (mf->get_sub_partition_id() == 0) { + if (mf->get_sub_partition_id() == 1) { printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); // print_tag(); } @@ -61,6 +67,17 @@ new_addr_type mee::get_sub_partition_id(mem_fetch *mf) { return mf->get_sub_partition_id(); } +unsigned int mee::get_BMT_Layer(new_addr_type addr) { + for (int i = 0; i <= 4; i++) { + if ((addr & BMT_mask[i]) == BMT_base[i]) { + return i; + } + } + // if (addr == BMT_ROOT_mf) + // return 5; + return 5; +} + bool mee::META_queue_empty() { return m_CTR_queue->empty() && m_Ciphertext_queue->empty() && m_MAC_queue->empty(); } @@ -76,9 +93,9 @@ new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partit void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); - partition_addr = partition_addr >> 18 << 7; + partition_addr = partition_addr >> 14 << 7; new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); - CTR_addr |= CTR_mask; + CTR_addr |= CTR_base; meta_access(m_CTR_queue, CTR_addr, meta_acc, 128, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, @@ -90,13 +107,31 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr) { new_addr_type sub_partition_id = get_sub_partition_id(mf); partition_addr = partition_addr >> 7 << 3; new_addr_type MAC_addr = get_addr(sub_partition_id, partition_addr); - MAC_addr |= MAC_mask; + MAC_addr |= MAC_base; meta_access(m_MAC_queue, MAC_addr, META_ACC, 8, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); } +void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + unsigned int Layer = get_BMT_Layer(mf->get_addr()); + if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 + return; + partition_addr = partition_addr & 0x003fffff; + partition_addr = partition_addr >> 7 << 3; + new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); + BMT_addr |= 0xf2000000; + + // printf("%llx %llx\n", mf->get_addr(), BMT_addr); + + meta_access(m_BMT_queue, BMT_addr, type, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); +} + void mee::meta_access( fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, @@ -105,7 +140,11 @@ void mee::meta_access( mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; for (unsigned i = 0; i < size; i++) byte_mask.set(i); - for (unsigned i = 0; i < size/32; i++) sector_mask.set(i + (addr & (1 << 7) ? 
2 : 0));
+ if (size == 128)
+ for (unsigned i = 0; i < size / 32; i++)
+ sector_mask.set(i);
+ else
+ sector_mask.set((addr >> 5) & 3);
 mem_access_t acc(type, addr, size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx);
 mem_fetch *mf = new mem_fetch(
@@ -132,23 +171,25 @@ void mee::CT_cycle() {
 if (!m_Ciphertext_RET_queue->empty()) {
 mem_fetch *mf_return = m_Ciphertext_RET_queue->top();
 int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id());
 if (mf_return->is_write()) { // write
 // print_addr("mee to L2 W:\t", mf_return);
 if (!m_unit->mee_L2_queue_full(spid)){
- m_unit->mee_L2_queue_push(spid, mf_return);
+ m_unit->mee_L2_queue_push(spid, mf_return); // ciphertext write finished, return it to L2
 m_Ciphertext_RET_queue->pop();
 }
 } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read
- m_AES_queue->push(mf_return);
- m_MAC_HASH_queue->push(mf_return);
+ m_AES_queue->push(mf_return); // ciphertext returned from DRAM, send it to AES for decryption
+ m_MAC_HASH_queue->push(mf_return); // hash the ciphertext for the MAC check
 m_Ciphertext_RET_queue->pop();
 }
 }

 if (!m_Ciphertext_queue->empty()) {
 mem_fetch *mf = m_Ciphertext_queue->top();
- if (mf->is_write() && !m_AES_queue->full()) { // write
- m_AES_queue->push(mf);
- m_Ciphertext_queue->pop();
+ if (mf->is_write()) { // write
+ if (!m_AES_queue->full()) {
+ m_AES_queue->push(mf); // write request: feed the plaintext into the AES unit for encryption
+ m_Ciphertext_queue->pop();
+ }
 } else if (!m_unit->mee_dram_queue_full()) { // read
- m_unit->mee_dram_queue_push(mf);
+ m_unit->mee_dram_queue_push(mf); // read request: fetch the ciphertext from DRAM
 m_Ciphertext_queue->pop();
 }
 }
}
@@ -157,25 +198,25 @@ void mee::AES_cycle() {
 if (!m_AES_queue->empty()) {
 mem_fetch *mf = m_AES_queue->top();
- new_addr_type REQ_addr = (new_addr_type) mf;
- new_addr_type OTP_addr = m_OTP_table[REQ_addr];
+ new_addr_type REQ_addr = (new_addr_type) mf; // plaintext/ciphertext of the encryption/decryption request
+ new_addr_type OTP_addr = m_OTP_table[REQ_addr]; // OTP
 int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id());
 // if (mf->get_sub_partition_id() == 0)
 // printf("%x\n", OTP_addr);
- if (m_OTP_set[OTP_addr]) {
- if (mf->is_write()) {
+ if (m_OTP_set[OTP_addr]) { // both the OTP and the plaintext/ciphertext are ready: AES encryption/decryption completes
+ if (mf->is_write()) { // encryption
- if (!m_unit->mee_dram_queue_full()) {
+ if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) {
 m_OTP_set[OTP_addr]--;
 m_OTP_table[REQ_addr] = 0;
- m_unit->mee_dram_queue_push(mf);
+ m_unit->mee_dram_queue_push(mf); // after encryption, update the ciphertext in DRAM
+ m_MAC_HASH_queue->push(mf); // the new ciphertext is available, hash it for the MAC update
 m_AES_queue->pop();
 }
- } else if (!m_unit->mee_L2_queue_full(spid)) {
+ } else if (!m_unit->mee_L2_queue_full(spid)) { // decryption
 m_OTP_set[OTP_addr]--;
 m_OTP_table[REQ_addr] = 0;
 // print_addr("mee to L2 R:\t", mf);
- m_unit->mee_L2_queue_push(spid, mf);
+ m_unit->mee_L2_queue_push(spid, mf); // after decryption, return the data to L2
 m_AES_queue->pop();

 }
 }
 }

 if (!m_OTP_queue->empty()){
 mem_fetch *mf = m_OTP_queue->top();
 if (mf) {
- m_OTP_set[(new_addr_type)mf]++;
+ m_OTP_set[(new_addr_type)mf]++; // OTP computation finished
 }
- delete mf;
+ // delete mf;
 m_OTP_queue->pop();
 }
}
@@ -196,11 +237,11 @@ void mee::MAC_CHECK_cycle() {
 if (!m_MAC_CHECK_queue->empty()) {
 // printf("AAAAAAAAAAAAA\n");
 mem_fetch *mf = m_MAC_CHECK_queue->top();
- new_addr_type REQ_addr = (new_addr_type) mf;
- new_addr_type HASH_addr = m_MAC_table[REQ_addr];
+ new_addr_type REQ_addr = (new_addr_type) mf; // MAC value read from the MAC cache
+ new_addr_type HASH_addr = m_MAC_table[REQ_addr]; // MAC hash value
 // if (mf->get_sub_partition_id() == 0)
 // printf("%x\n", OTP_addr);
- if (true || m_MAC_set[HASH_addr]) {
+ if (m_MAC_set[HASH_addr]) { // both the MAC and the hash are ready: the MAC check completes
 m_MAC_set[HASH_addr]--;
 m_MAC_table[REQ_addr] = 0;
 m_MAC_CHECK_queue->pop();
 }
 }
@@ -211,24 +252,96 @@ void mee::MAC_CHECK_cycle() {
 // printf("BBBBBBBBBBBBBBB\n");
 mem_fetch *mf = m_MAC_HASH_queue->top();
 if (mf) {
- m_MAC_set[(new_addr_type)mf]++;
+ m_MAC_set[(new_addr_type)mf]++; // MAC hash computation finished
 }
 // delete mf;
 m_MAC_HASH_queue->pop();
 }
}

+void mee::BMT_CHECK_cycle() {
+ if (!m_BMT_CHECK_queue->empty()) {
+ // printf("AAAAAAAAAAAAA\n");
+ mem_fetch *mf = m_BMT_CHECK_queue->top();
+ new_addr_type REQ_addr = (new_addr_type) mf; // BMT value read from the BMT cache
+ new_addr_type HASH_addr = m_BMT_table[REQ_addr]; // BMT hash value
+ // if (mf->get_sub_partition_id() == 0)
+ // printf("%x\n", OTP_addr);
+ // assert(mf);
+ if (m_BMT_set[HASH_addr] && !m_BMT_queue->full(2) && !m_BMT_HASH_queue->full()) { // both the BMT node and the hash are ready: this level's BMT check completes, move on to the next level
+ m_BMT_set[HASH_addr]--;
+ m_BMT_table[REQ_addr] = 0;
+ m_BMT_table.erase(m_BMT_table.find(REQ_addr));
+ m_BMT_CHECK_queue->pop();
+ // BMT_busy = false;
+ // print_addr("BMT Check finish:", mf);
+ // if (mf->get_sub_partition_id() == 1)
+ // printf("%d %d\n", m_BMT_table.size(), m_BMT_table.empty());

+ // printf("BBBBBB");
+ // walk up to the next BMT level
+ if (!mf || get_BMT_Layer(mf->get_addr()) == 5) {
+ // printf("AAAAAAAAAAAA\n");
+ BMT_busy = false;
+ cnt--;
+ } else if (get_BMT_Layer(mf->get_addr()) == 4) {
+ // printf("AAAAAAAAAAAA\n");
+ m_BMT_CHECK_queue->push(BMT_ROOT_mf);
+ m_BMT_table[(new_addr_type) BMT_ROOT_mf] = (new_addr_type) mf;
+ m_BMT_HASH_queue->push(mf);
+ } else {
+ // gen_BMT_mf(mf, false, META_RBW, 128);
+ gen_BMT_mf(mf, mf->is_write(), META_ACC, 8);
+ m_BMT_HASH_queue->push(mf);
+ }

+ // if (REQ_addr == (new_addr_type) BMT_ROOT_mf) {
+ // printf("AAAAAAAAAAAA\n");
+ // BMT_busy = false;
+ // }
+ }
+ }

+ if (!m_BMT_HASH_queue->empty()) {
+ // printf("BBBBBBBBBBBBBBB\n");
+ mem_fetch *mf = m_BMT_HASH_queue->top();
+ if (mf) {
+ m_BMT_set[(new_addr_type)mf]++; // BMT hash computation finished
+ }
+ // delete mf;
+ m_BMT_HASH_queue->pop();
+ }

+ // CTR to BMT
+ if (!m_CTR_BMT_Buffer->empty() && !m_BMT_queue->full() && !m_BMT_HASH_queue->full() && !BMT_busy && m_BMT_table.empty()) {
+ assert(cnt==0);
+ // assert(cnt);
+ mem_fetch *mf = m_CTR_BMT_Buffer->top();
+ gen_BMT_mf(mf, mf->is_write(), META_ACC, 8);
+ m_BMT_HASH_queue->push(mf);
+ m_CTR_BMT_Buffer->pop();
+ BMT_busy = true;
+ cnt++;
+ }
+}

 void mee::CTR_cycle() {
 if (!m_CTR_RET_queue->empty()) {
 mem_fetch *mf_return = m_CTR_RET_queue->top();
- if (mf_return->get_type() == META_RBW) {
+ if (mf_return->get_type() == META_RBW) { // return of the CTR read-before-write issued ahead of a CTR update
+ // if (!m_CTR_queue->full()) {
 m_CTR_RET_queue->pop();
- gen_CTR_mf(mf_return->get_original_mf(), META_ACC, true);
- delete mf_return;
- } else {
+ // gen_CTR_mf(mf_return->get_original_mf(), META_ACC, true); // update the CTR and issue the CTR write request
+ // }
+ // else {
+ // assert(!m_CTR_RET_queue->full());
+ // }
+ // delete mf_return; // delete site 1
+ } else { // a CTR read miss returned (CTR writes always hit)
 // print_addr("MISS OTP:\t\t", mf_return);
- if (!m_OTP_queue->full()) {
- m_OTP_queue->push(mf_return);
+ if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { // a CTR read miss should also spawn a CTR-to-BMT task
+ m_OTP_queue->push(mf_return); // CTR value available, compute the OTP for decryption
+ // m_CTR_BMT_Buffer->push(mf_return);
 m_CTR_RET_queue->pop();
 }
 }
 }

 m_CTRcache->cycle();

- if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !m_OTP_queue->full()) {
+ bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full();
+ bool port_free = m_unit->m_MACcache->data_port_free();
+
+ if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) {
 mem_fetch *mf = m_CTR_queue->top();
 // print_addr("CTR cycle access:\t\t", mf);

 if (mf->is_write()) {
- if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) {
+ if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) { // the CTR must be read before it can be incremented and written back
 return;
 }
 }

 if (mf->get_type() != META_RBW) {
- m_OTP_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf;
+ m_OTP_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf; // record the <encrypt/decrypt, OTP> pairing
 }

 std::list<cache_event> events;
 enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events);
 bool write_sent = was_write_sent(events);
 bool read_sent = was_read_sent(events);
 // print_addr("CTR cycle access:\t\t", mf);
 if (status == HIT) {
 // if (!m_OTP_queue->full()) {
 // print_addr("HIT OTP:\t\t", mf);
- m_OTP_queue->push(mf);
+ m_OTP_queue->push(mf); // CTR hit: compute the OTP for encryption/decryption
 m_CTR_queue->pop();
+ if (mf->is_write()) { // the CTR changed, so the BMT must be updated too: spawn a CTR-to-BMT task
+ // m_CTR_BMT_Buffer->push(mf);
+ }
 // }
 } else if (status != RESERVATION_FAIL) {
 // set waiting for CTR fill
 // print_addr("CTR cycle access:\t\t", mf);
 m_CTR_queue->pop();
 } else {
 // print_addr("CTR cycle RESERVATION_FAIL:\t", mf);
 // if (get_sub_partition_id(mf) == 0)
 // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events);
 assert(!write_sent);
 assert(!read_sent);
 }
 }
};

 void mee::MAC_cycle() {
 if (!m_MAC_RET_queue->empty()) {
 mem_fetch *mf_return = m_MAC_RET_queue->top();
- if (mf_return->is_write()) {
+ if (mf_return->is_write()) { // MAC write finished
 m_MAC_RET_queue->pop();
- delete mf_return;
- } else {
- // print_addr("MISS OTP:\t\t", mf_return);
+ // delete mf_return; // delete site 2
+ } else { // a MAC read miss returned
 if (!m_MAC_CHECK_queue->full()) {
- // m_MAC_CHECK_queue->push(mf_return);
+ m_MAC_CHECK_queue->push(mf_return); // MAC read miss completed, MAC value available, send it to the MAC check
 m_MAC_RET_queue->pop();
- } else {
- if (mf_return->get_sub_partition_id() == 1) {
- print_addr("MAC Full:", mf_return);
- }
 }
 }
 }

 m_MACcache->cycle();
+
+ bool output_full = m_MAC_CHECK_queue->full() || m_MAC_RET_queue->full();// &&
+ bool port_free = m_unit->m_MACcache->data_port_free();

- if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !m_MAC_CHECK_queue->full()) {
+ if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) {
 mem_fetch *mf = m_MAC_queue->top();
- print_addr("MAC cycle access:\t\t", mf);
+ // print_addr("MAC cycle access:\t\t", mf);

- if (mf->is_write()) {
- if (!m_MAC_set[(new_addr_type)mf]) {
+ if (mf->is_write()) { // a MAC write must wait until the ciphertext has been hashed into the new MAC value
+ if (!m_MAC_set[(new_addr_type)mf->get_original_mf()]) {
 return;
- } else {
-
 }
- } else {
- m_MAC_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf;
+ } else { // a MAC read: record the MAC-check task it generates
+ m_MAC_table[(new_addr_type)mf] = (new_addr_type)mf->get_original_mf();
 }

 std::list<cache_event> events;
 enum cache_request_status status = m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events);
 bool write_sent = was_write_sent(events);
 bool read_sent = was_read_sent(events);
 // print_addr("CTR cycle access:\t\t", mf);
 if (status == HIT) {
 // if (!m_OTP_queue->full()) {
 // print_addr("HIT OTP:\t\t", mf);
- if (mf->is_write()) {
- m_MAC_set[(new_addr_type)mf]--;
+ if (mf->is_write()) { // MAC write hit: the MAC hash value has been consumed
+ m_MAC_set[(new_addr_type)mf->get_original_mf()]--;
 } else {
- // m_MAC_CHECK_queue->push(mf);
+ m_MAC_CHECK_queue->push(mf); // MAC read hit: MAC value available, send it to the MAC check
 }
 m_MAC_queue->pop();
 // }
 } else if (status != RESERVATION_FAIL) {
 // set waiting for CTR fill
 // print_addr("CTR cycle access:\t\t", mf);
- if (mf->is_write()) {
- m_MAC_set[(new_addr_type)mf]--;
+ if (mf->is_write()) { // MAC write miss: the MAC hash value has been consumed
+ m_MAC_set[(new_addr_type)mf->get_original_mf()]--;
 }
 m_MAC_queue->pop();
 } else {
 // print_addr("CTR cycle RESERVATION_FAIL:\t", mf);
 // if (get_sub_partition_id(mf) == 0)
 // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events);
 assert(!write_sent);
 assert(!read_sent);
 }
- }
- else {
- if (m_unit->mee_dram_queue_full()) printf("AAAAAAAAAAAAAA\n");
- if (m_MAC_CHECK_queue->full()) printf("BBBBBBBBBBBBBBBBB\n");
- // if (!m_OTP_queue->empty() && m_CTR_queue->empty()) printf("CCCCCCCCCCCCCCCCCCCCCCCC\n");
 }
};

 void mee::BMT_cycle() {
+ if (!m_BMT_RET_queue->empty() && !m_BMT_CHECK_queue->full()) {
+ mem_fetch *mf_return = m_BMT_RET_queue->top();
+ // print_addr("MISS OTP:\t\t",
mf_return); + if (mf_return->get_type() != META_RBW) + m_BMT_CHECK_queue->push(mf_return); + m_BMT_RET_queue->pop(); + } + + m_BMTcache->cycle(); + + if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full() && !m_BMT_CHECK_queue->full()) { + mem_fetch *mf = m_BMT_queue->top(); + // print_addr("MAC cycle access:\t\t", mf); + + if (mf->is_write()) { + //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT + if (!m_BMT_set[(new_addr_type)mf->get_original_mf()]) { + return; + } + } + + if (mf->get_type() != META_RBW) + m_BMT_table[(new_addr_type)mf] = (new_addr_type)mf->get_original_mf(); + std::list events; + enum cache_request_status status = m_BMTcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + // print_addr("CTR cycle access:\t\t", mf); + if (mf->get_type() == META_RBW) { + assert(status == HIT); + } + if (status == HIT) { + if (mf->get_type() != META_RBW) + m_BMT_CHECK_queue->push(mf); + m_BMT_queue->pop(); + } else if (status != RESERVATION_FAIL) { + m_BMT_queue->pop(); + } else { + // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + // if (get_sub_partition_id(mf) == 0) + // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + assert(!write_sent); + assert(!read_sent); + } + } }; void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { - if (m_METAcache->access_ready()) { + if (m_METAcache->access_ready() && !m_META_RET_queue->full()) { mem_fetch *mf = m_METAcache->next_access(); m_META_RET_queue->push(mf); - print_addr("fill responses:", mf); + if (m_METAcache == m_BMTcache) + print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; } } -void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK) { - if ((mf->get_addr() & MASK) && m_METAcache->waiting_for_fill(mf)) { +void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE) { + // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); + + if (((mf->get_addr() & MASK) == BASE) && m_METAcache->waiting_for_fill(mf)) { // print_addr("wating for fill:\t\t", mf); - if (m_METAcache->fill_port_free() && !m_META_RET_queue->full()) { + if (m_METAcache->fill_port_free()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - print_addr("fill:\t\t\t\t", mf); + if (m_METAcache == m_BMTcache) + print_addr("fill:\t\t\t\t", mf); + // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); // if (mf->get_sub_partition_id() == 1) { // printf("CTR Fill: %p\n", mf); // // printf("CTR Next: %p\n", m_CTR_queue->top()); @@ -383,9 +543,19 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET } void mee::simple_cycle(unsigned cycle) { + MAC_CHECK_cycle(); + MAC_cycle(); + BMT_CHECK_cycle(); + BMT_cycle(); + AES_cycle(); + CTR_cycle(); + CT_cycle(); // META Cache fill responses META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); + for (int layer = 1; layer <= 4; layer++){ + META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[layer]); + } // META_fill_responses(m_BMTcache); // dram to mee if 
(!m_unit->dram_mee_queue_empty()) { @@ -401,8 +571,11 @@ void mee::simple_cycle(unsigned cycle) { // printf("Success handle CTR_ACC: "); // print_addr("META return to mee", mf_return); // delete mf_return; - META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask); - META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask); + META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base); + META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base); + for (int layer = 1; layer <= 4; layer++){ + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[layer], BMT_base[layer]); + } } else { // 密文访存返回 // reply L2 read // reply L2 write back @@ -422,7 +595,7 @@ void mee::simple_cycle(unsigned cycle) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); // print_addr("L2 to mee: ", mf); // mee to dram - if (!m_unit->mee_dram_queue_full() && !m_CTR_queue->full() && !m_MAC_queue->full() && !m_BMT_queue->full() && !m_Ciphertext_queue->full()) { + if (!m_unit->mee_dram_queue_full() && !m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_BMT_queue->full() && !m_Ciphertext_queue->full()) { // print_addr("L2 to mee: ", mf); if (!mf->is_write()) { // L2 read // CTR access @@ -441,7 +614,7 @@ void mee::simple_cycle(unsigned cycle) { // CTR access gen_CTR_mf(mf, META_RBW, false); // CTR update - // gen_CTR_mf(mf, META_ACC, true); + gen_CTR_mf(mf, META_ACC, true); // AES Ecryption // AES_queue.push(mf); @@ -458,15 +631,15 @@ void mee::simple_cycle(unsigned cycle) { m_unit->L2_mee_queue_pop(cycle&1); - } else { } } - AES_cycle(); - CTR_cycle(); - CT_cycle(); - MAC_CHECK_cycle(); - MAC_cycle(); } void mee::cycle(unsigned cycle) { -} \ No newline at end of file +} + +//BMT next Layer +//BMT buzy +//BMT erase +//BMT write需要阻塞,CTR read可以连续访问 +//BMT 写前读 ok \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 067b18141..bbf6acc1e 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -24,21 +24,23 @@ class mee { void AES_cycle(); void CT_cycle(); void MAC_CHECK_cycle(); + void BMT_CHECK_cycle(); new_addr_type get_partition_addr(mem_fetch *mf); new_addr_type get_sub_partition_id(mem_fetch *mf); new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); + unsigned int get_BMT_Layer(new_addr_type addr); void gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr); void gen_MAC_mf(mem_fetch *mf, bool wr); - // void gen_BMT_mf(mem_fetch *mf, bool wr); + void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size); bool META_queue_empty(); void META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); - void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK); + void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE); bool CTR_busy(); bool MAC_busy(); - bool BMT_busy(); + bool BMT_busy; @@ -63,15 +65,49 @@ class mee { fifo_pipeline *m_AES_queue; fifo_pipeline *m_MAC_HASH_queue; fifo_pipeline *m_MAC_CHECK_queue; - const new_addr_type CTR_mask = 0x10000000; - const new_addr_type MAC_mask = 0x20000000; + + //m_CTR_BMT_Buffer-->m_BMT_CHECK_queue--|--> + // |->m_BMT_HASH_queue---| + // m_BMT_queue-->m_BMT_RET_queue--> + fifo_pipeline *m_BMT_CHECK_queue; + fifo_pipeline *m_BMT_HASH_queue; + fifo_pipeline *m_CTR_BMT_Buffer; + + //CTR: 1111 1110 0000 0000 0000 0000 0000 0000 + //L1 : 1111 1111 1110 0000 0000 0000 0000 
0000 + //L2 : 1111 1111 1111 1110 0000 0000 0000 0000 + //L3 : 1111 1111 1111 1111 1100 0000 1000 0000 + //L4 : 1111 1111 1111 1111 1100 0000 1111 1000 + //ROOT:1111 1111 1111 1111 1100 0000 1111 1000 + const new_addr_type BMT_mask[5] = {0xFE000000, 0xFFE00000, 0xFFFE0000, 0xFFFFC080, 0xFFFFC0F8}; + + const new_addr_type CTR_mask = 0xFE000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_mask = 0xF0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 + + //CTR: 1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + //L1 : 1111 0010 000x xxxx xxxx xxxx xxxx x000 + //L2 : 1111 0010 0010 000x xxxx xxxx xxxx x000 + //L3 : 1111 0010 0010 0010 00xx xxxx 0xxx x000 + //L4 : 1111 0010 0010 0010 00xx xxxx 1000 0000 + //ROOT:1111 0010 0010 0010 00xx xxxx 1000 1000 + const new_addr_type BMT_base[5] = {0xF0000000, 0xF2000000, 0xF2200000, 0xF2220000, 0xF2220080}; + + const new_addr_type CTR_base = 0xF0000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_base = 0xE0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 + const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; typedef tr1_hash_map table; typedef tr1_hash_map set; - table m_OTP_table; - set m_OTP_set; - table m_MAC_table; - set m_MAC_set; + table m_OTP_table; //<密文,OTP(CTR)> + set m_OTP_set; // + table m_MAC_table; // + set m_MAC_set; // + table m_BMT_table; // + set m_BMT_set; // + //1111 1111 1111 1111 1100 0000 1111 1000 + mem_fetch *BMT_ROOT_mf = NULL; + int cnt = 0; + }; \ No newline at end of file From 845a4c24e54e2bd114b2e687188c2828df632a4d Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 21 Aug 2024 02:50:49 +0800 Subject: [PATCH 097/133] mee v0.4 --- src/gpgpu-sim/mee.cc | 76 +++++++++++++++++++++++++++++--------------- src/gpgpu-sim/mee.h | 1 + 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index c78a4fa26..14556e03c 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -11,11 +11,11 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_CTR_queue = new fifo_pipeline("meta-queue", 0, 8); m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 8); m_MAC_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_queue = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, 80); m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 8); m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 80); m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 8); m_OTP_queue = new fifo_pipeline("meta-queue", 10, 18); @@ -24,9 +24,9 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); - m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); - m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 0, 180); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 80); + m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 80); BMT_busy = false; } @@ -159,7 +159,9 @@ void mee::meta_access( reqs.push_back(mf); for (unsigned i = 0; i < reqs.size(); ++i) { + assert(reqs.size() == 1); mem_fetch *req = reqs[i]; + assert(!m_META_queue->full()); m_META_queue->push(req); } } @@ -281,17 
+283,24 @@ void mee::BMT_CHECK_cycle() { // printf("BBBBBB"); //计算下一层BMT if (!mf || get_BMT_Layer(mf->get_addr()) == 5) { - // printf("AAAAAAAAAAAA\n"); + printf("AAAAAAAAAAAA\n"); BMT_busy = false; cnt--; } else if (get_BMT_Layer(mf->get_addr()) == 4) { - // printf("AAAAAAAAAAAA\n"); + printf("AAAAAAAAAAAA\n"); + assert(!m_BMT_CHECK_queue->full()); m_BMT_CHECK_queue->push(BMT_ROOT_mf); m_BMT_table[(new_addr_type) BMT_ROOT_mf] = (new_addr_type) mf; + assert(!m_BMT_HASH_queue->full()); m_BMT_HASH_queue->push(mf); } else { - // gen_BMT_mf(mf, false, META_RBW, 128); + printf("XXXXXXXXXXXXX\n"); + assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); + assert(!m_BMT_queue->full(2)); + gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128); + assert(!m_BMT_queue->full()); gen_BMT_mf(mf, mf->is_write(), META_ACC, 8); + assert(!m_BMT_HASH_queue->full()); m_BMT_HASH_queue->push(mf); } @@ -328,7 +337,7 @@ void mee::BMT_CHECK_cycle() { void mee::CTR_cycle() { if (!m_CTR_RET_queue->empty()) { mem_fetch *mf_return = m_CTR_RET_queue->top(); - if (mf_return->get_type() == META_RBW) { //更新CTR前的CTR读MISS返回 + if (mf_return->get_access_type() == META_RBW) { //更新CTR前的CTR读MISS返回 // if (!m_CTR_queue->full()) { m_CTR_RET_queue->pop(); // gen_CTR_mf(mf_return->get_original_mf(), META_ACC, true); //更新CTR,生成写CTR的请求 @@ -338,10 +347,11 @@ void mee::CTR_cycle() { // } // delete mf_return;//删除1 } else { //CTR读MISS返回,CTR写一定命中 + assert(!mf_return->is_write()); // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(mf_return); //得到CTR值,计算OTP用于解密 - // m_CTR_BMT_Buffer->push(mf_return); + m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } } @@ -362,7 +372,7 @@ void mee::CTR_cycle() { } } - if (mf->get_type() != META_RBW) { + if (mf->get_access_type() != META_RBW) { m_OTP_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf; //生成<加密/解密, OTP>任务 } @@ -377,7 +387,7 @@ void mee::CTR_cycle() { m_OTP_queue->push(mf); //CTR HIT后计算OTP用于加密/解密 m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 - // m_CTR_BMT_Buffer->push(mf); + m_CTR_BMT_Buffer->push(mf); } // } } else if (status != RESERVATION_FAIL) { @@ -415,7 +425,7 @@ void mee::MAC_cycle() { m_MACcache->cycle(); - bool output_full = m_MAC_CHECK_queue->full(); + bool output_full = m_MAC_CHECK_queue->full() || m_MAC_RET_queue->full();// && bool port_free = m_unit->m_MACcache->data_port_free(); if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { @@ -463,19 +473,33 @@ void mee::MAC_cycle() { }; void mee::BMT_cycle() { - if (!m_BMT_RET_queue->empty() && !m_BMT_CHECK_queue->full()) { + if (!m_BMT_RET_queue->empty()) { mem_fetch *mf_return = m_BMT_RET_queue->top(); // print_addr("MISS OTP:\t\t", mf_return); - if (mf_return->get_type() != META_RBW) - m_BMT_CHECK_queue->push(mf_return); - m_BMT_RET_queue->pop(); + if (mf_return->get_access_type() != META_RBW) { + if (!m_BMT_CHECK_queue->full()) { + m_BMT_CHECK_queue->push(mf_return); + m_BMT_RET_queue->pop(); + } + } else { + m_BMT_RET_queue->pop(); + } } m_BMTcache->cycle(); + + bool output_full = m_BMT_CHECK_queue->full() || m_BMT_RET_queue->full(); + bool port_free = m_unit->m_BMTcache->data_port_free(); - if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full() && !m_BMT_CHECK_queue->full()) { + if (!m_BMT_queue->empty()) { + mem_fetch *mf = m_BMT_queue->top(); + // assert(mf->get_access_type() == META_RBW); + } + + if 
(!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { mem_fetch *mf = m_BMT_queue->top(); // print_addr("MAC cycle access:\t\t", mf); + // assert(mf->get_access_type() == mf->get_access_type()); if (mf->is_write()) { //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT @@ -484,7 +508,7 @@ void mee::BMT_cycle() { } } - if (mf->get_type() != META_RBW) + if (mf->get_access_type() != META_RBW) m_BMT_table[(new_addr_type)mf] = (new_addr_type)mf->get_original_mf(); std::list events; @@ -492,19 +516,16 @@ void mee::BMT_cycle() { bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); - if (mf->get_type() == META_RBW) { + if (mf->get_access_type() == META_RBW) { assert(status == HIT); } if (status == HIT) { - if (mf->get_type() != META_RBW) + if (mf->get_access_type() != META_RBW) m_BMT_CHECK_queue->push(mf); m_BMT_queue->pop(); } else if (status != RESERVATION_FAIL) { m_BMT_queue->pop(); } else { - // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); - // if (get_sub_partition_id(mf) == 0) - // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); assert(!read_sent); } @@ -542,7 +563,12 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET } } +void mee::pr(fifo_pipeline *m_META_RET_queue) { + printf("%d\n",m_META_RET_queue->get_length()); +} + void mee::simple_cycle(unsigned cycle) { + // pr(m_CTR_BMT_Buffer); MAC_CHECK_cycle(); MAC_cycle(); BMT_CHECK_cycle(); @@ -595,7 +621,7 @@ void mee::simple_cycle(unsigned cycle) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); // print_addr("L2 to mee: ", mf); // mee to dram - if (!m_unit->mee_dram_queue_full() && !m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_BMT_queue->full() && !m_Ciphertext_queue->full()) { + if (!m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { // print_addr("L2 to mee: ", mf); if (!mf->is_write()) { // L2 read // CTR access diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index bbf6acc1e..774cbbc6c 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -41,6 +41,7 @@ class mee { bool CTR_busy(); bool MAC_busy(); bool BMT_busy; + void pr(fifo_pipeline *m_META_RET_queue); From 77991638b4044ef5288e5849e656c3ceef73da77 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 21 Aug 2024 02:59:59 +0800 Subject: [PATCH 098/133] mee v0.4 --- src/gpgpu-sim/mee.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 14556e03c..493be66ec 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -11,11 +11,11 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_CTR_queue = new fifo_pipeline("meta-queue", 0, 8); m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 8); m_MAC_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_queue = new fifo_pipeline("meta-queue", 0, 80); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, 8); m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 8); m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 80); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 8); m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 8); m_OTP_queue = new fifo_pipeline("meta-queue", 10, 18); @@ -24,9 +24,9 @@ mee::mee(class memory_partition_unit *unit, class 
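// ---------------------------------------------------------------------------
// [Editor's sketch - not part of the patch series]
// Patch 097's main change to MAC_cycle()/BMT_cycle() is the gating idiom made
// explicit above: a request leaves its input queue only when everything it may
// need in the same cycle (DRAM queue, result queue, cache data port) can take
// it, so a stalled consumer back-pressures the producer instead of dropping
// requests. Minimal shape of that idiom, with hypothetical names:
//
//   if (in->empty()) return;                        // nothing to do
//   bool output_full = out->full() || ret->full();  // downstream has no room
//   bool port_free   = cache->data_port_free();     // cache can be probed
//   if (dram->full() || output_full || !port_free) return;  // stall this cycle
//   ... access the cache, then in->pop();
// ---------------------------------------------------------------------------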
l2_cache *CTRcache, class l2_c m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 0, 180); - m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 80); - m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 80); + m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 0, 18); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); + m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 8); BMT_busy = false; } @@ -283,18 +283,18 @@ void mee::BMT_CHECK_cycle() { // printf("BBBBBB"); //计算下一层BMT if (!mf || get_BMT_Layer(mf->get_addr()) == 5) { - printf("AAAAAAAAAAAA\n"); + // printf("AAAAAAAAAAAA\n"); BMT_busy = false; cnt--; } else if (get_BMT_Layer(mf->get_addr()) == 4) { - printf("AAAAAAAAAAAA\n"); + // printf("AAAAAAAAAAAA\n"); assert(!m_BMT_CHECK_queue->full()); m_BMT_CHECK_queue->push(BMT_ROOT_mf); m_BMT_table[(new_addr_type) BMT_ROOT_mf] = (new_addr_type) mf; assert(!m_BMT_HASH_queue->full()); m_BMT_HASH_queue->push(mf); } else { - printf("XXXXXXXXXXXXX\n"); + // printf("XXXXXXXXXXXXX\n"); assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); assert(!m_BMT_queue->full(2)); gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128); @@ -536,8 +536,8 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipelineaccess_ready() && !m_META_RET_queue->full()) { mem_fetch *mf = m_METAcache->next_access(); m_META_RET_queue->push(mf); - if (m_METAcache == m_BMTcache) - print_addr("fill responses:", mf); + // if (m_METAcache == m_BMTcache) + // print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; } @@ -551,8 +551,8 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET if (m_METAcache->fill_port_free()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - if (m_METAcache == m_BMTcache) - print_addr("fill:\t\t\t\t", mf); + // if (m_METAcache == m_BMTcache) + // print_addr("fill:\t\t\t\t", mf); // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); // if (mf->get_sub_partition_id() == 1) { // printf("CTR Fill: %p\n", mf); From 21618223fc6eeec9141dde56468c7590c8a8013a Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 21 Aug 2024 17:47:14 +0800 Subject: [PATCH 099/133] mee v1.0 --- .../SM7_QV100/base/gpgpusim.config_base | 4 +- src/gpgpu-sim/gpu-sim.cc | 65 +++++++++++++++++++ src/gpgpu-sim/gpu-sim.h | 1 + src/gpgpu-sim/l2cache.cc | 37 +++++++++++ src/gpgpu-sim/l2cache.h | 2 + 5 files changed, 107 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base index a40bdc3e4..89df77532 100644 --- a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base +++ b/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 5000000 +-gpgpu_max_cycle 4000000 # Device Limits -gpgpu_stack_size_limit 1024 @@ -163,7 +163,7 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dmeta N:16:128:16,L:B:m:W:P,A:192:4,32:0,32 +-gpgpu_cache:dmeta N:1:128:16,L:B:m:W:X,A:4:1,32:0,32 # -gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index c4000c148..5a1108e09 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1351,6 +1351,65 @@ void gpgpu_sim::clear_executed_kernel_info() { m_executed_kernel_names.clear(); m_executed_kernel_uids.clear(); } + +void gpgpu_sim::gpu_print_METACache_stat(char META[]) { + if (!m_memory_config->m_META_config.disabled()) { + cache_stats l2_stats; + struct cache_sub_stats l2_css; + struct cache_sub_stats total_l2_css; + l2_stats.clear(); + l2_css.clear(); + total_l2_css.clear(); + + printf("\n========= %s cache stats =========\n", META); + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i]->accumulate_METAcache_stats(l2_stats, META); + m_memory_partition_unit[i]->get_METAcache_sub_stats(l2_css, META); + + fprintf(stdout, + "%s_cache_bank[%d]: Access = %llu, Miss = %llu, Miss_rate = " + "%.3lf, Pending_hits = %llu, Reservation_fails = %llu\n", + META, i, l2_css.accesses, l2_css.misses, + (double)l2_css.misses / (double)l2_css.accesses, + l2_css.pending_hits, l2_css.res_fails); + + total_l2_css += l2_css; + } + + if (!m_memory_config->m_META_config.disabled() && + m_memory_config->m_META_config.get_num_lines()) { + // L2c_print_cache_stat(); + printf("%s_total_cache_accesses = %llu\n", META, total_l2_css.accesses); + printf("%s_total_cache_misses = %llu\n", META, total_l2_css.misses); + if (total_l2_css.accesses > 0) + printf("%s_total_cache_miss_rate = %.4lf\n", + META, (double)total_l2_css.misses / (double)total_l2_css.accesses); + printf("%s_total_cache_pending_hits = %llu\n", META, total_l2_css.pending_hits); + printf("%s_total_cache_reservation_fails = %llu\n", + META, total_l2_css.res_fails); + printf("%s_total_cache_breakdown:\n", META); + + char META_cache_stats_breakdown[128]; + strcpy(META_cache_stats_breakdown, META); + strcat(META_cache_stats_breakdown, "_cache_stats_breakdown"); + l2_stats.print_stats(stdout, META_cache_stats_breakdown); + + printf("%s_total_cache_reservation_fail_breakdown:\n", META); + + char META_cache_stats_fail_breakdown[128]; + strcpy(META_cache_stats_fail_breakdown, META); + strcat(META_cache_stats_fail_breakdown, "_cache_stats_fail_breakdown"); + l2_stats.print_fail_stats(stdout, "L2_cache_stats_fail_breakdown"); + + char META_cache[128]; + strcpy(META_cache, META); + strcat(META_cache, "_cache"); + total_l2_css.print_port_stats(stdout, META_cache); + } + } +} + void gpgpu_sim::gpu_print_stat() { FILE *statfout = stdout; @@ -1501,6 +1560,12 @@ void gpgpu_sim::gpu_print_stat() { total_l2_css.print_port_stats(stdout, "L2_cache"); } } + // CTR cache stats + gpu_print_METACache_stat("CTR"); + // MAC cache stats + gpu_print_METACache_stat("MAC"); + // BMT cache stats + gpu_print_METACache_stat("BMT"); if (m_config.gpgpu_cflog_interval != 0) { spill_log_to_file(stdout, 1, gpu_sim_cycle); diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 731f5e8e6..56159c4d3 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -570,6 +570,7 @@ class gpgpu_sim : public gpgpu_t { void decrement_kernel_latency(); const gpgpu_sim_config &get_config() const { return m_config; } + void 
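// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// Two small issues in gpu_print_METACache_stat() above: (1) the per-bank
// printf divides l2_css.misses by l2_css.accesses without the "> 0" guard that
// protects the total, so an idle metadata cache bank prints nan; (2)
// META_cache_stats_fail_breakdown is built with strcpy/strcat but the call
// below it still passes the literal "L2_cache_stats_fail_breakdown", so the
// CTR/MAC/BMT failure breakdowns all come out labelled as L2. A guarded ratio
// could look like this hypothetical helper:
static inline double safe_miss_rate(unsigned long long misses,
                                    unsigned long long accesses) {
  return accesses ? (double)misses / (double)accesses : 0.0;
}
// ---------------------------------------------------------------------------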
gpu_print_METACache_stat(char META[]); void gpu_print_stat(); void dump_pipeline(int mask, int s, int m) const; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 133ab94b1..51c7b642e 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -456,6 +456,43 @@ void memory_partition_unit::print(FILE *fp) const { m_dram->print(fp); } +void memory_partition_unit::accumulate_METAcache_stats( + class cache_stats &l2_stats, char META[]) const { + class l2_cache *m_METAcache; + if (strcmp(META, "CTR") == 0) { + m_METAcache = m_CTRcache; + } else if (strcmp(META, "MAC") == 0) { + m_METAcache = m_MACcache; + } else if (strcmp(META, "BMT") == 0) { + m_METAcache = m_BMTcache; + } else { + // 如果 s 不是预期的值,可以在这里添加错误处理逻辑 + assert(0); + } + if (!m_config->m_META_config.disabled()) { + l2_stats += m_METAcache->get_stats(); + } +} + +void memory_partition_unit::get_METAcache_sub_stats( + struct cache_sub_stats &css, char META[]) const { + class l2_cache *m_METAcache; + if (strcmp(META, "CTR") == 0) { + m_METAcache = m_CTRcache; + } else if (strcmp(META, "MAC") == 0) { + m_METAcache = m_MACcache; + } else if (strcmp(META, "BMT") == 0) { + m_METAcache = m_BMTcache; + } else { + // 如果 s 不是预期的值,可以在这里添加错误处理逻辑 + assert(0); + } + if (!m_config->m_META_config.disabled()) { + m_METAcache->get_sub_stats(css); + } +} + + memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, const memory_config *config, class memory_stats_t *stats, diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 6c982bab4..ab36c0aab 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -87,6 +87,8 @@ class memory_partition_unit { void print_stat(FILE *fp) { m_dram->print_stat(fp); } void visualize() const { m_dram->visualize(); } void print(FILE *fp) const; + void accumulate_METAcache_stats(class cache_stats &l2_stats, char META[]) const; + void get_METAcache_sub_stats(struct cache_sub_stats &css, char META[]) const; void handle_memcpy_to_gpu(size_t dst_start_addr, unsigned subpart_id, mem_access_sector_mask_t mask); From a475d07cf7090c09b6bf973369a22f050e16d78d Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 21 Aug 2024 18:11:53 +0800 Subject: [PATCH 100/133] mee v1.0 --- .../SM7_QV100/{base => base_mee}/accelwattch_ptx_sim.xml | 0 .../SM7_QV100/{base => base_mee}/accelwattch_ptx_sim_alt.xml | 0 .../SM7_QV100/{base => base_mee}/accelwattch_sass_hw.xml | 0 .../SM7_QV100/{base => base_mee}/accelwattch_sass_hybrid.xml | 0 .../SM7_QV100/{base => base_mee}/accelwattch_sass_sim.xml | 0 .../SM7_QV100/{base => base_mee}/accelwattch_sass_sim_alt.xml | 0 .../SM7_QV100/{base => base_mee}/config_volta_islip.icnt | 0 .../gpgpusim.config_base => base_mee/gpgpusim.config_base_mee} | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base => base_mee}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base/gpgpusim.config_base => base_mee/gpgpusim.config_base_mee} (100%) diff --git 
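// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// The Chinese comment in the two functions above says: "if the string is not
// one of the expected values, error handling can be added here" (currently the
// fallback is assert(0)). accumulate_METAcache_stats() and
// get_METAcache_sub_stats() duplicate the same strcmp dispatch; a shared
// helper, sketched here with a hypothetical name, would keep the two in sync:
//
//   class l2_cache *memory_partition_unit::meta_cache_by_name(const char *META) const {
//     if (strcmp(META, "CTR") == 0) return m_CTRcache;
//     if (strcmp(META, "MAC") == 0) return m_MACcache;
//     if (strcmp(META, "BMT") == 0) return m_BMTcache;
//     assert(0 && "unknown metadata cache name");
//     return NULL;
//   }
// ---------------------------------------------------------------------------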
a/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base/gpgpusim.config_base rename to configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee From 9016f5033c7f0bbcf517e1f1e72c2de4956fa616 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Thu, 22 Aug 2024 18:09:36 +0800 Subject: [PATCH 101/133] mee v1.0 --- configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index 89df77532..bec1f5dff 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -163,7 +163,7 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dmeta N:1:128:16,L:B:m:W:X,A:4:1,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 # -gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 From 77fa0ec19617cd9428b82ff8c4a89da33f276557 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 24 Aug 2024 15:51:13 +0800 Subject: [PATCH 102/133] mee v1.0.1 --- src/gpgpu-sim/mee.cc | 264 +++++++++++++++++++++++-------------- src/gpgpu-sim/mee.h | 16 ++- src/gpgpu-sim/mem_fetch.cc | 1 + src/gpgpu-sim/mem_fetch.h | 6 + 4 files changed, 180 insertions(+), 107 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 493be66ec..bec5df023 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -8,25 +8,25 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - m_CTR_queue = new fifo_pipeline("meta-queue", 0, 8); - m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 8); - m_MAC_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_queue = new fifo_pipeline("meta-queue", 0, 8); + m_CTR_queue = new fifo_pipeline("meta-queue", 0, 64); + m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_queue = new fifo_pipeline("meta-queue", 0, 64); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, 64); - m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 8); - m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 8); + m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 640); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_OTP_queue = new fifo_pipeline("meta-queue", 10, 18); - m_AES_queue = new fifo_pipeline("meta-queue", 0, 8); + m_OTP_queue = new fifo_pipeline("meta-queue", 40, 68); + m_AES_queue = new fifo_pipeline("meta-queue", 0, 6400); - m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 10, 18); - m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); + m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 40, 68); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); - m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 0, 18); - m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 8); - m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 8); + m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 40, 68); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); + m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 64); BMT_busy = false; } @@ -34,7 +34,7 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (mf->get_sub_partition_id() == 1) { + if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25 || true) { printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); // print_tag(); } @@ -178,17 +178,18 @@ void mee::CT_cycle() { } } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 - m_MAC_HASH_queue->push(mf_return); //对密文进行hash,用于MAC Check + 
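// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// Reading -gpgpu_cache:dmeta the same way the comment above reads
// -gpgpu_cache:dl2 (sets : line size : associativity; 32*128*24 = 96 KB per
// sub partition), the metadata cache configured per sub partition is:
//   N:16:128:16  ->  16 sets * 128 B * 16 ways = 32 KB   (original)
//   N:1:128:16   ->   1 set  * 128 B * 16 ways =  2 KB   (patch 099)
//   N:4:128:4    ->   4 sets * 128 B *  4 ways =  2 KB   (patch 101)
// i.e. patches 099/101 keep a 2 KB CTR/MAC/BMT cache but change its geometry;
// the remaining fields (policy letters and the A:... group) also change and
// are not interpreted here.
// ---------------------------------------------------------------------------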
m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return->get_original_mf()])); //对密文进行hash,用于MAC Check m_Ciphertext_RET_queue->pop(); } } if (!m_Ciphertext_queue->empty()) { mem_fetch *mf = m_Ciphertext_queue->top(); - if (mf->is_write()) { // write + if (mf->is_write() && mf->is_raw()) { // write if (!m_AES_queue->full()) { m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 - m_Ciphertext_queue->pop(); + mf->set_cooked_status(); + // m_Ciphertext_queue->pop(); } } else if (!m_unit->mee_dram_queue_full()) { // read m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 @@ -201,34 +202,43 @@ void mee::AES_cycle() { if (!m_AES_queue->empty()) { mem_fetch *mf = m_AES_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf; //加密/解密请求的明文/密文 - new_addr_type OTP_addr = m_OTP_table[REQ_addr]; //OTP + unsigned OTP_id = m_OTP_table[REQ_addr]; //OTP int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); - if (m_OTP_set[OTP_addr]) { // 得到了OTP和明文/密文,AES加密/解密完成 + if (m_OTP_set[OTP_id]) { // 得到了OTP和明文/密文,AES加密/解密完成 if (mf->is_write()) { //加密 if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { - m_OTP_set[OTP_addr]--; + m_OTP_set[OTP_id]--; m_OTP_table[REQ_addr] = 0; m_unit->mee_dram_queue_push(mf); //加密完后更新DRAM中的密文 - m_MAC_HASH_queue->push(mf); //加密完后得到密文,对密文进行MAC Hash + m_MAC_table[(new_addr_type)mf] = ++MAC_couter; + assert(m_MAC_table[(new_addr_type)mf]); + m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf])); //加密完后得到密文,对密文进行MAC Hash m_AES_queue->pop(); + m_unit->L2_mee_queue_pop(spid); } } else if (!m_unit->mee_L2_queue_full(spid)) { //解密 - m_OTP_set[OTP_addr]--; + m_OTP_set[OTP_id]--; m_OTP_table[REQ_addr] = 0; // print_addr("mee to L2 R:\t", mf); m_unit->mee_L2_queue_push(spid, mf); //解密完后返回L2 + // printf("JJJJJJJJJJJJJJJJJJJJJJJJJ"); m_AES_queue->pop(); + } else { + printf("IIIIIIIIIIIIIIII\n"); } + } else { + // if (mf->get_sub_partition_id() == 24) + printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); } } if (!m_OTP_queue->empty()){ - mem_fetch *mf = m_OTP_queue->top(); + unsigned *mf = m_OTP_queue->top(); if (mf) { - m_OTP_set[(new_addr_type)mf]++; //OTP计算完成 + m_OTP_set[*mf]++; //OTP计算完成 } // delete mf; m_OTP_queue->pop(); @@ -239,22 +249,26 @@ void mee::MAC_CHECK_cycle() { if (!m_MAC_CHECK_queue->empty()) { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_MAC_CHECK_queue->top(); - new_addr_type REQ_addr = (new_addr_type) mf; //MAC Cache的值 - new_addr_type HASH_addr = m_MAC_table[REQ_addr]; //MAC Hash值 + new_addr_type REQ_addr = (new_addr_type) mf->get_original_mf(); //MAC Cache的值 + unsigned HASH_id = m_MAC_table[REQ_addr]; //MAC Hash值 // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); - if (m_MAC_set[HASH_addr]) { //得到了MAC与Hash值,MAC Check完成 - m_MAC_set[HASH_addr]--; + if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 + m_MAC_set[HASH_id]--; m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); + printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); + } else { + // if (mf->get_sub_partition_id() == 32) + printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } } if (!m_MAC_HASH_queue->empty()) { // printf("BBBBBBBBBBBBBBB\n"); - mem_fetch *mf = m_MAC_HASH_queue->top(); + unsigned *mf = m_MAC_HASH_queue->top(); if (mf) { - m_MAC_set[(new_addr_type)mf]++; //MAC Hash计算完成 + m_MAC_set[*mf]++; //MAC Hash计算完成 } // delete mf; m_MAC_HASH_queue->pop(); @@ -266,12 +280,12 @@ 
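// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// Functionally this block models counter-mode memory encryption with
// integrity (a hedged reading of the code; the patches never state it):
//   OTP       = cipher_K(counter, block address)    - latency of m_OTP_queue
//   plaintext = ciphertext XOR OTP                   - paired up in AES_cycle()
//   tag       = Hash(ciphertext), compared with the stored MAC in
//               MAC_CHECK_cycle()
//   counters  = protected by the BMT hash tree, verified up to BMT_ROOT_mf
// The simulator never touches real data: it only tracks *when* each quantity
// becomes available, using the per-request ids and the id -> count maps.
// ---------------------------------------------------------------------------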
void mee::BMT_CHECK_cycle() { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_BMT_CHECK_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf; //BMT Cache的值 - new_addr_type HASH_addr = m_BMT_table[REQ_addr]; //BMT Hash值 + unsigned HASH_id = m_BMT_table[REQ_addr]; //BMT Hash值 // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); // assert(mf); - if (m_BMT_set[HASH_addr] && !m_BMT_queue->full(2) && !m_BMT_HASH_queue->full()) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT - m_BMT_set[HASH_addr]--; + if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2) && !m_BMT_HASH_queue->full()) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + m_BMT_set[HASH_id]--; m_BMT_table[REQ_addr] = 0; m_BMT_table.erase(m_BMT_table.find(REQ_addr)); m_BMT_CHECK_queue->pop(); @@ -290,9 +304,9 @@ void mee::BMT_CHECK_cycle() { // printf("AAAAAAAAAAAA\n"); assert(!m_BMT_CHECK_queue->full()); m_BMT_CHECK_queue->push(BMT_ROOT_mf); - m_BMT_table[(new_addr_type) BMT_ROOT_mf] = (new_addr_type) mf; + m_BMT_table[(new_addr_type) BMT_ROOT_mf] = ++BMT_couter; assert(!m_BMT_HASH_queue->full()); - m_BMT_HASH_queue->push(mf); + m_BMT_HASH_queue->push(new unsigned(BMT_couter)); } else { // printf("XXXXXXXXXXXXX\n"); assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); @@ -301,7 +315,7 @@ void mee::BMT_CHECK_cycle() { assert(!m_BMT_queue->full()); gen_BMT_mf(mf, mf->is_write(), META_ACC, 8); assert(!m_BMT_HASH_queue->full()); - m_BMT_HASH_queue->push(mf); + // m_BMT_HASH_queue->push(m_BMT_table[(new_addr_type) mf]); } // if (REQ_addr == (new_addr_type) BMT_ROOT_mf) { @@ -313,9 +327,9 @@ void mee::BMT_CHECK_cycle() { if (!m_BMT_HASH_queue->empty()) { // printf("BBBBBBBBBBBBBBB\n"); - mem_fetch *mf = m_BMT_HASH_queue->top(); + unsigned *mf = m_BMT_HASH_queue->top(); if (mf) { - m_BMT_set[(new_addr_type)mf]++; //BMT Hash计算完成 + m_BMT_set[*mf]++; //BMT Hash计算完成 } // delete mf; m_BMT_HASH_queue->pop(); @@ -327,7 +341,7 @@ void mee::BMT_CHECK_cycle() { // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); gen_BMT_mf(mf, mf->is_write(), META_ACC, 8); - m_BMT_HASH_queue->push(mf); + // m_BMT_HASH_queue->push(mf); m_CTR_BMT_Buffer->pop(); BMT_busy = true; cnt++; @@ -350,8 +364,10 @@ void mee::CTR_cycle() { assert(!mf_return->is_write()); // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 - m_OTP_queue->push(mf_return); //得到CTR值,计算OTP用于解密 - m_CTR_BMT_Buffer->push(mf_return); + m_OTP_queue->push(new unsigned(m_OTP_table[(new_addr_type) mf_return->get_original_mf()])); //得到CTR值,计算OTP用于解密 + if (mf_return->get_sub_partition_id() == 24) + printf("%p OTP MISS\n", mf_return->get_original_mf()); + // m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } } @@ -360,7 +376,7 @@ void mee::CTR_cycle() { m_CTRcache->cycle(); bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full(); - bool port_free = m_unit->m_MACcache->data_port_free(); + bool port_free = m_unit->m_CTRcache->data_port_free(); if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { mem_fetch *mf = m_CTR_queue->top(); @@ -373,21 +389,27 @@ void mee::CTR_cycle() { } if (mf->get_access_type() != META_RBW) { - m_OTP_table[(new_addr_type)mf->get_original_mf()] = (new_addr_type)mf; //生成<加密/解密, OTP>任务 + m_OTP_table[(new_addr_type)mf->get_original_mf()] = ++OTP_couter; //生成<加密/解密, OTP>任务 + if (mf->get_sub_partition_id() == 24) + printf("ins <%p, %u>\n", mf->get_original_mf(), OTP_couter); } std::list events; enum 
cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); - // print_addr("CTR cycle access:\t\t", mf); + if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25) + printf("%d ", status); + print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); - m_OTP_queue->push(mf); //CTR HIT后计算OTP用于加密/解密 + m_OTP_queue->push(new unsigned(OTP_couter)); //CTR HIT后计算OTP用于加密/解密 + if (mf->get_sub_partition_id() == 24) + printf("%p OTP HIT\n", mf->get_original_mf()); m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 - m_CTR_BMT_Buffer->push(mf); + // m_CTR_BMT_Buffer->push(mf); } // } } else if (status != RESERVATION_FAIL) { @@ -395,13 +417,17 @@ void mee::CTR_cycle() { // print_addr("CTR cycle access:\t\t", mf); m_CTR_queue->pop(); } else { - // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + print_addr("CTR cycle RESERVATION_FAIL:\t", mf); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); assert(!read_sent); + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXX"); } - } + } else { + // if (!m_CTR_queue->empty()) + // printf("GGGGGGGGGGGGGGGGGGGGGGGG"); + } // else if (mf->get_sub_partition_id() == 1) { // if (m_unit->mee_dram_queue_full()) printf("AAAAAAAAAAAAAA\n"); // if (m_OTP_queue->full()) printf("BBBBBBBBBBBBBBBBB\n"); @@ -416,6 +442,7 @@ void mee::MAC_cycle() { m_MAC_RET_queue->pop(); // delete mf_return;//删除2 } else { //MAC读MISS返回 + assert(!mf_return->is_write()); if (!m_MAC_CHECK_queue->full()) { m_MAC_CHECK_queue->push(mf_return); //MAC读MISS完成,得到MAC值,发往MAC Check m_MAC_RET_queue->pop(); @@ -433,11 +460,11 @@ void mee::MAC_cycle() { // print_addr("MAC cycle access:\t\t", mf); if (mf->is_write()) { //对于写MAC请求,则应等待密文被Hash为新MAC值 - if (!m_MAC_set[(new_addr_type)mf->get_original_mf()]) { + if (!m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]) { return; } } else { //对于读MAC请求,生成的MAC Check任务 - m_MAC_table[(new_addr_type)mf] = (new_addr_type)mf->get_original_mf(); + // m_MAC_table[(new_addr_type)mf->get_original_mf()] = ++MAC_couter; } std::list events; @@ -449,7 +476,7 @@ void mee::MAC_cycle() { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); if (mf->is_write()) { //MAC写HIT,则MAC Hash值使用结束 - m_MAC_set[(new_addr_type)mf->get_original_mf()]--; + m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]--; } else { m_MAC_CHECK_queue->push(mf); //MAC读HIT,得到MAC值,发往MAC Check } @@ -459,13 +486,14 @@ void mee::MAC_cycle() { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 - m_MAC_set[(new_addr_type)mf->get_original_mf()]--; + m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]--; } m_MAC_queue->pop(); } else { // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + print_addr("MAC cycle RESERVATION_FAIL:\t", mf); assert(!write_sent); assert(!read_sent); } @@ -508,8 +536,10 @@ void mee::BMT_cycle() { } } - if (mf->get_access_type() != META_RBW) - m_BMT_table[(new_addr_type)mf] = (new_addr_type)mf->get_original_mf(); + if (mf->get_access_type() 
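// ---------------------------------------------------------------------------
// [Editor's sketch - not part of the patch series]
// After this patch the rendezvous between a request and its OTP / MAC hash is
// a small scoreboard: an id is attached to the mem_fetch, the crypto unit
// pushes the same id through a latency queue, and the consumer spins on a
// per-id counter (m_OTP_set / m_MAC_set). The mechanism, with hypothetical
// names and std::unordered_map in place of tr1_hash_map:
#include <unordered_map>
struct token_scoreboard {
  std::unordered_map<unsigned, unsigned> ready;  // id -> number of completions
  void complete(unsigned id) { ++ready[id]; }    // OTP or hash finished
  bool consume(unsigned id) {                    // AES / MAC-check side
    std::unordered_map<unsigned, unsigned>::iterator it = ready.find(id);
    if (it == ready.end() || it->second == 0) return false;  // not ready: stall
    --it->second;
    return true;
  }
};
// ---------------------------------------------------------------------------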
!= META_RBW) { + m_BMT_table[(new_addr_type)mf] = ++BMT_couter; + m_BMT_HASH_queue->push(new unsigned(BMT_couter)); + } std::list events; enum cache_request_status status = m_BMTcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); @@ -537,17 +567,21 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipelinenext_access(); m_META_RET_queue->push(mf); // if (m_METAcache == m_BMTcache) - // print_addr("fill responses:", mf); + print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; + } else { + // if (mf->get_sub_partition_id() == 32 && m_META_RET_queue->full()){ + // print_addr("fill responses ERROR:", mf); + // } } } void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE) { // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); - if (((mf->get_addr() & MASK) == BASE) && m_METAcache->waiting_for_fill(mf)) { - // print_addr("wating for fill:\t\t", mf); + if (((mf->get_addr() & MASK) == BASE) || m_METAcache->waiting_for_fill(mf)) { + print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); @@ -568,14 +602,8 @@ void mee::pr(fifo_pipeline *m_META_RET_queue) { } void mee::simple_cycle(unsigned cycle) { + // printf("AAAAAAAAAAAAAAAAAAAAAA"); // pr(m_CTR_BMT_Buffer); - MAC_CHECK_cycle(); - MAC_cycle(); - BMT_CHECK_cycle(); - BMT_cycle(); - AES_cycle(); - CTR_cycle(); - CT_cycle(); // META Cache fill responses META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); @@ -583,9 +611,11 @@ void mee::simple_cycle(unsigned cycle) { META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[layer]); } // META_fill_responses(m_BMTcache); + // dram to mee if (!m_unit->dram_mee_queue_empty()) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(); + print_addr("Cipertext fill:", mf_return); // print_addr("dram_mee_queue_top:\t", mf_return); // mee to L2 @@ -606,9 +636,23 @@ void mee::simple_cycle(unsigned cycle) { // reply L2 read // reply L2 write back //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); - if (!m_Ciphertext_RET_queue->full()) { - m_Ciphertext_RET_queue->push(mf_return); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + + if (mf_return->is_write()) { // write + // print_addr("mee to L2 W:\t", mf_return); + if (!m_unit->mee_L2_queue_full(spid)){ + m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 + m_unit->dram_mee_queue_pop(); + } + } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read + m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 + m_MAC_table[(new_addr_type)mf_return] = ++MAC_couter; + assert(m_MAC_table[(new_addr_type)mf_return]); + m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check m_unit->dram_mee_queue_pop(); + // printf("HHHHHHHHHHHHHHHH"); + } else { + // printf("HHHHHHHHHHHHHHHH"); } // print_addr("mee to L2: ", mf_return); } @@ -621,44 +665,52 @@ void mee::simple_cycle(unsigned cycle) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); // print_addr("L2 to mee: ", mf); // mee to dram - if (!m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { - // print_addr("L2 to mee: 
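// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// Read path through simple_cycle() as wired in this patch: an L2 read is
// forwarded to DRAM for the ciphertext while gen_CTR_mf()/gen_MAC_mf() enqueue
// the metadata lookups in parallel; a CTR hit or fill pushes the request id
// into m_OTP_queue (modelling the pad computation), the returning ciphertext
// feeds m_AES_queue and m_MAC_HASH_queue, AES_cycle() sends the decrypted line
// back to L2 once its OTP id is ready, and MAC_cycle()/MAC_CHECK_cycle()
// compare the stored MAC with the computed hash. Writes take the reverse path
// (read-before-write of the counter, encrypt, then write ciphertext and MAC),
// gated by mf->is_raw()/set_cooked_status(). The CTR-miss-to-BMT path
// (m_CTR_BMT_Buffer feeding the BMT walk) was enabled in patch 097 but appears
// to be commented out again here.
// ---------------------------------------------------------------------------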
", mf); - if (!mf->is_write()) { // L2 read - // CTR access + assert(mf->is_raw()); + + if (!m_CTR_queue->full(2) && !m_MAC_queue->full()) { + assert(!mf->is_write()); + if (mf->is_write()) { // write + printf("AAAAAAAAAAAAAAAAAAAAAA"); + if (mf->is_raw() && !m_AES_queue->full()) { + gen_CTR_mf(mf, META_RBW, false); + gen_CTR_mf(mf, META_ACC, true); + gen_MAC_mf(mf, true); + m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 + mf->set_cooked_status(); + // printf("BBBBBBBBBBBBBBBBB"); + } + } else if (!m_unit->mee_dram_queue_full()) { // read + // printf("CCCCCCCCCCCCCCCC"); + m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 gen_CTR_mf(mf, META_ACC, false); - // Ciphertext access - m_Ciphertext_queue->push(mf); - // MAC access gen_MAC_mf(mf, false); - // AES Decryption - // AES_cycle(); - // Hash MAC - - // MAC Check - // BMT Check - } else { // L2 write back - // CTR access - gen_CTR_mf(mf, META_RBW, false); - // CTR update - gen_CTR_mf(mf, META_ACC, true); - // AES Ecryption - - // AES_queue.push(mf); - - // Ciphertext Update - m_Ciphertext_queue->push(mf); - // MAC access - // gen_MAC_mf(mf, false); - // MAC Hash - // MAC Update - gen_MAC_mf(mf, true); - // BMT Update + m_unit->L2_mee_queue_pop(cycle&1); } - - m_unit->L2_mee_queue_pop(cycle&1); - + } else { + if (m_unit->get_mpid() <= 16){ + if (m_CTR_RET_queue->full()) + printf("AAAAAAAAAAAAAAAAAAAAAA"); + if (m_MAC_RET_queue->full()) + printf("BBBBBBBBBBBBBBBBB"); + if (m_BMT_RET_queue->full()) + printf("CCCCCCCCCCCC"); + if (m_AES_queue->full()) + printf("DDDDDDDDDDDDDDDD"); + // if (m_AES_queue->full()) + // printf("EEEEEEEEEEEEEEEE"); + if (!m_unit->mee_dram_queue_empty()) + printf("FFFFFFFFFFFFFFFFFF"); + } + } } + MAC_CHECK_cycle(); + MAC_cycle(); + BMT_CHECK_cycle(); + BMT_cycle(); + AES_cycle(); + CTR_cycle(); + // CT_cycle(); } void mee::cycle(unsigned cycle) { @@ -668,4 +720,12 @@ void mee::cycle(unsigned cycle) { //BMT buzy //BMT erase //BMT write需要阻塞,CTR read可以连续访问 -//BMT 写前读 ok \ No newline at end of file +//BMT 写前读 ok +//检查写操作 +//实现写密文阻塞 +//实现mf id匹配 +//增加密文队列 +//BMT不需要每层都Check +//增加访存类型的属性 +//单个HASH单元 +// \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 774cbbc6c..6b1c9a106 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -62,16 +62,17 @@ class mee { fifo_pipeline *m_BMT_RET_queue; fifo_pipeline *m_Ciphertext_RET_queue; - fifo_pipeline *m_OTP_queue; + fifo_pipeline *m_OTP_queue; fifo_pipeline *m_AES_queue; - fifo_pipeline *m_MAC_HASH_queue; + + fifo_pipeline *m_MAC_HASH_queue; fifo_pipeline *m_MAC_CHECK_queue; //m_CTR_BMT_Buffer-->m_BMT_CHECK_queue--|--> // |->m_BMT_HASH_queue---| // m_BMT_queue-->m_BMT_RET_queue--> fifo_pipeline *m_BMT_CHECK_queue; - fifo_pipeline *m_BMT_HASH_queue; + fifo_pipeline *m_BMT_HASH_queue; fifo_pipeline *m_CTR_BMT_Buffer; //CTR: 1111 1110 0000 0000 0000 0000 0000 0000 @@ -99,8 +100,8 @@ class mee { const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; - typedef tr1_hash_map table; - typedef tr1_hash_map set; + typedef tr1_hash_map table; + typedef tr1_hash_map set; table m_OTP_table; //<密文,OTP(CTR)> set m_OTP_set; // table m_MAC_table; // @@ -111,4 +112,9 @@ class mee { mem_fetch *BMT_ROOT_mf = NULL; int cnt = 0; + unsigned OTP_couter = 0; + unsigned MAC_couter = 0; + unsigned BMT_couter = 0; + + }; \ No newline at end of file diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 456d891dd..a2a7aced2 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -69,6 +69,7 @@ 
mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_raw_addr.chip = m_original_mf->get_tlx_addr().chip; m_raw_addr.sub_partition = m_original_mf->get_tlx_addr().sub_partition; } + raw_data = true; } mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; } diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index e039846e3..905b7cf92 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -128,6 +128,10 @@ class mem_fetch { mem_fetch *get_original_mf() { return original_mf; } mem_fetch *get_original_wr_mf() { return original_wr_mf; } + bool is_raw() {return raw_data; } + + void set_cooked_status() {raw_data = false; } + private: // request source information unsigned m_request_uid; @@ -174,6 +178,8 @@ class mem_fetch { // size), so the pointer refers to the original request mem_fetch *original_wr_mf; // this pointer refers to the original write req, // when fetch-on-write policy is used + bool raw_data = true; + }; #endif From 3a3b2d2a0b68d4c980721c67a920f5fb2724c14f Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 25 Aug 2024 03:49:28 +0800 Subject: [PATCH 103/133] mee v1.1 --- .../base_mee/gpgpusim.config_base_mee | 4 +- src/gpgpu-sim/gpu-cache.cc | 3 + src/gpgpu-sim/l2cache.cc | 4 + src/gpgpu-sim/mee.cc | 326 +++++++++++------- src/gpgpu-sim/mee.h | 20 +- src/gpgpu-sim/mem_fetch.cc | 1 + src/gpgpu-sim/mem_fetch.h | 17 +- 7 files changed, 233 insertions(+), 142 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index bec1f5dff..394874c8f 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -150,6 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +#-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 @@ -163,8 +164,9 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
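// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// The raw_data flag added to mem_fetch above is a two-state marker for writes:
// "raw" means the request still carries plaintext from the L2, "cooked" means
// it has already been through the AES stage. CT_cycle()/simple_cycle() only
// push a raw write into m_AES_queue and immediately call set_cooked_status(),
// so the same request is not queued for encryption twice while it waits in
// m_Ciphertext_queue. Equivalent shape with a default member initializer:
//   bool raw_data = true;                       // also set in the constructor
//   bool is_raw() const { return raw_data; }    // the patch version is non-const
//   void set_cooked_status() { raw_data = false; }
// ---------------------------------------------------------------------------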
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -# -gpgpu_cache:dmeta S:16:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 373fa4b5f..c8e65e831 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1084,6 +1084,9 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { } extra_mf_fields_lookup::iterator e = m_extra_mf_fields.find(mf); + if (e == m_extra_mf_fields.end()) { + printf("XXXXXXXXX%x %dXXXXXXXXXXXXXXX\n", mf->get_addr(), mf->get_access_type()); + } assert(e != m_extra_mf_fields.end()); assert(e->second.m_valid); mf->set_data_size(e->second.m_data_size); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 51c7b642e..7b4a0bbcd 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -558,6 +558,8 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { if (!m_config->m_L2_config.disabled()) { if (m_L2cache->access_ready() && !m_L2_icnt_queue->full()) { mem_fetch *mf = m_L2cache->next_access(); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + if (mf->get_access_type() != L2_WR_ALLOC_R) { // Don't pass write allocate read request back to // upper level cache @@ -584,6 +586,8 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // DRAM to L2 (texture) and icnt (not texture) if (!m_mee_L2_queue->empty()) { mem_fetch *mf = m_mee_L2_queue->top(); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { if (m_L2cache->fill_port_free()) { mf->set_status(IN_PARTITION_L2_FILL_QUEUE, diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index bec5df023..d0a126448 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -8,25 +8,26 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - m_CTR_queue = new fifo_pipeline("meta-queue", 0, 64); - m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, 64); - m_MAC_queue = new fifo_pipeline("meta-queue", 0, 64); - m_BMT_queue = new fifo_pipeline("meta-queue", 0, 64); + unsigned len = 8; + m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); + m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, len); + m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, len); - m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, 640); - m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, 64); - m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, 64); + m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, len); - m_OTP_queue = new fifo_pipeline("meta-queue", 40, 68); - m_AES_queue = new fifo_pipeline("meta-queue", 0, 6400); + m_OTP_queue = new 
fifo_pipeline("meta-queue", 40, 40 + len); + m_AES_queue = new fifo_pipeline("meta-queue", 0, len); - m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 40, 68); - m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); + m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); - m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 40, 68); - m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, 64); - m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, 64); + m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); + m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, len); BMT_busy = false; } @@ -90,7 +91,7 @@ new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partit return new_addr; } -void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr) { +void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); partition_addr = partition_addr >> 14 << 7; @@ -99,10 +100,10 @@ void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr) { meta_access(m_CTR_queue, CTR_addr, meta_acc, 128, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, CTR); } -void mee::gen_MAC_mf(mem_fetch *mf, bool wr) { +void mee::gen_MAC_mf(mem_fetch *mf, bool wr, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); partition_addr = partition_addr >> 7 << 3; @@ -111,10 +112,10 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr) { meta_access(m_MAC_queue, MAC_addr, META_ACC, 8, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, MAC); } -void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size) { +void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); unsigned int Layer = get_BMT_Layer(mf->get_addr()); @@ -129,13 +130,13 @@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size meta_access(m_BMT_queue, BMT_addr, type, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT); } void mee::meta_access( fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf) const { + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type) const { mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; @@ -161,6 +162,8 @@ void mee::meta_access( for (unsigned i = 0; i < reqs.size(); ++i) { assert(reqs.size() == 1); mem_fetch *req = reqs[i]; + reqs[i]->set_id(mf_id); + reqs[i]->set_data_type(m_data_type); assert(!m_META_queue->full()); m_META_queue->push(req); } @@ -170,30 +173,49 @@ void mee::CT_cycle() { if (!m_Ciphertext_RET_queue->empty()) { mem_fetch *mf_return = m_Ciphertext_RET_queue->top(); int spid = 
m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + // if (mf_return->get_access_type() != L1_WR_ALLOC_R && mf_return->get_access_type() != L2_WR_ALLOC_R) { if (mf_return->is_write()) { // write // print_addr("mee to L2 W:\t", mf_return); - if (!m_unit->mee_L2_queue_full(spid)){ - m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 + // if (!m_unit->mee_L2_queue_full(spid)){ + // m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 m_Ciphertext_RET_queue->pop(); - } + // } } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 - m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return->get_original_mf()])); //对密文进行hash,用于MAC Check + // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf_return]); + m_MAC_HASH_queue->push(new unsigned(mf_return->get_id())); //从DRAM中取到密文,对密文进行MAC Hash + // m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check m_Ciphertext_RET_queue->pop(); } + // } else { + // m_Ciphertext_RET_queue->pop(); + // } } - if (!m_Ciphertext_queue->empty()) { + if (!m_Ciphertext_queue->empty() && CT_counter < OTP_counter) { mem_fetch *mf = m_Ciphertext_queue->top(); - if (mf->is_write() && mf->is_raw()) { // write - if (!m_AES_queue->full()) { + if (mf->is_write()) { // write + if (mf->is_raw() && !m_AES_queue->full()) { + printf("QQQQQQQQQQQQQQQQ\n"); m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 mf->set_cooked_status(); - // m_Ciphertext_queue->pop(); + // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf]); + // m_MAC_HASH_queue->push(new unsigned(mf->get_id())); //加密完后得到密文,对密文进行MAC Hash + // m_Ciphertext_queue->pop(); //加密完后才可以生成访存 + } else { + if (!mf->is_raw()) { + printf("RRRRRRRRRRRRRRR"); + } + if (m_AES_queue->full()) { + printf("SSSSSSSSSSSSSSSSSSS"); + } } } else if (!m_unit->mee_dram_queue_full()) { // read m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 m_Ciphertext_queue->pop(); + CT_counter++; } } } @@ -202,25 +224,32 @@ void mee::AES_cycle() { if (!m_AES_queue->empty()) { mem_fetch *mf = m_AES_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf; //加密/解密请求的明文/密文 - unsigned OTP_id = m_OTP_table[REQ_addr]; //OTP + unsigned OTP_id = mf->get_id(); //OTP int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); + assert(OTP_id); + if (mf->is_write()) + printf("PPPPPPPPPPPPPP\n"); if (m_OTP_set[OTP_id]) { // 得到了OTP和明文/密文,AES加密/解密完成 if (mf->is_write()) { //加密 + printf("OOOOOOOOOOOOOOOOOOOOOO\n"); if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { m_OTP_set[OTP_id]--; - m_OTP_table[REQ_addr] = 0; + // m_OTP_table[REQ_addr] = 0; m_unit->mee_dram_queue_push(mf); //加密完后更新DRAM中的密文 - m_MAC_table[(new_addr_type)mf] = ++MAC_couter; - assert(m_MAC_table[(new_addr_type)mf]); - m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf])); //加密完后得到密文,对密文进行MAC Hash + CT_counter++; + // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf]); + m_MAC_HASH_queue->push(new unsigned(mf->get_id())); //加密完后得到密文,对密文进行MAC Hash m_AES_queue->pop(); - m_unit->L2_mee_queue_pop(spid); + // m_unit->L2_mee_queue_pop(spid); + m_Ciphertext_queue->pop(); //写密文发往DRAM + printf("NNNNNNNNNNNNNNNNNNNNN\n"); } } else if (!m_unit->mee_L2_queue_full(spid)) { //解密 m_OTP_set[OTP_id]--; - 
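// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch series]
// The running counters used above (OTP_counter, CT_counter, MAC_counter) act
// as ordering guards between stages: a ciphertext request is only issued to
// DRAM while CT_counter < OTP_counter (its counter lookup has at least been
// started), and a MAC access is only issued while MAC_counter < CT_counter.
// Together with the per-request ids this keeps the CTR -> ciphertext -> MAC
// streams in lock-step without a full matching structure (hedged reading of
// the code).
// ---------------------------------------------------------------------------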
m_OTP_table[REQ_addr] = 0; + // m_OTP_table[REQ_addr] = 0; // print_addr("mee to L2 R:\t", mf); m_unit->mee_L2_queue_push(spid, mf); //解密完后返回L2 // printf("JJJJJJJJJJJJJJJJJJJJJJJJJ"); @@ -230,7 +259,7 @@ void mee::AES_cycle() { printf("IIIIIIIIIIIIIIII\n"); } } else { - // if (mf->get_sub_partition_id() == 24) + if (mf->is_write()) printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); } } @@ -250,17 +279,18 @@ void mee::MAC_CHECK_cycle() { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_MAC_CHECK_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf->get_original_mf(); //MAC Cache的值 - unsigned HASH_id = m_MAC_table[REQ_addr]; //MAC Hash值 + unsigned HASH_id = mf->get_id(); //MAC Hash值 // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); + assert(HASH_id); if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 m_MAC_set[HASH_id]--; - m_MAC_table[REQ_addr] = 0; + // m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } else { // if (mf->get_sub_partition_id() == 32) - printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); + // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } } @@ -304,16 +334,16 @@ void mee::BMT_CHECK_cycle() { // printf("AAAAAAAAAAAA\n"); assert(!m_BMT_CHECK_queue->full()); m_BMT_CHECK_queue->push(BMT_ROOT_mf); - m_BMT_table[(new_addr_type) BMT_ROOT_mf] = ++BMT_couter; + m_BMT_table[(new_addr_type) BMT_ROOT_mf] = ++BMT_counter; assert(!m_BMT_HASH_queue->full()); - m_BMT_HASH_queue->push(new unsigned(BMT_couter)); + m_BMT_HASH_queue->push(new unsigned(BMT_counter)); } else { // printf("XXXXXXXXXXXXX\n"); assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); assert(!m_BMT_queue->full(2)); - gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128); + gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128, 0); assert(!m_BMT_queue->full()); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8); + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, 0); assert(!m_BMT_HASH_queue->full()); // m_BMT_HASH_queue->push(m_BMT_table[(new_addr_type) mf]); } @@ -340,7 +370,7 @@ void mee::BMT_CHECK_cycle() { assert(cnt==0); // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8); + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, 0); // m_BMT_HASH_queue->push(mf); m_CTR_BMT_Buffer->pop(); BMT_busy = true; @@ -361,12 +391,12 @@ void mee::CTR_cycle() { // } // delete mf_return;//删除1 } else { //CTR读MISS返回,CTR写一定命中 - assert(!mf_return->is_write()); + // assert(!mf_return->is_write()); // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 - m_OTP_queue->push(new unsigned(m_OTP_table[(new_addr_type) mf_return->get_original_mf()])); //得到CTR值,计算OTP用于解密 - if (mf_return->get_sub_partition_id() == 24) - printf("%p OTP MISS\n", mf_return->get_original_mf()); + m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 + if (mf_return->get_sub_partition_id() == 60) + printf("%p OTP %d MISS\n", mf_return->get_original_mf(), mf_return->get_id()); // m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } @@ -385,39 +415,47 @@ void mee::CTR_cycle() { if (mf->is_write()) { if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR return; + } else { + printf("MMMMMMMMMMMMMM\n"); } } if (mf->get_access_type() != META_RBW) { - m_OTP_table[(new_addr_type)mf->get_original_mf()] = 
++OTP_couter; //生成<加密/解密, OTP>任务 - if (mf->get_sub_partition_id() == 24) - printf("ins <%p, %u>\n", mf->get_original_mf(), OTP_couter); + // m_OTP_table[(new_addr_type)mf->get_original_mf()] = ++OTP_counter; //生成<加密/解密, OTP>任务 + // if (mf->get_sub_partition_id() == 24) + // printf("ins <%p, %u>\n", mf->get_original_mf(), OTP_counter); } std::list events; enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); - if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25) - printf("%d ", status); - print_addr("CTR cycle access:\t\t", mf); + // if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25) + // printf("%d ", status); + // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); - m_OTP_queue->push(new unsigned(OTP_couter)); //CTR HIT后计算OTP用于加密/解密 - if (mf->get_sub_partition_id() == 24) - printf("%p OTP HIT\n", mf->get_original_mf()); + + if (mf->get_sub_partition_id() == 60) + printf("%p OTP %d HIT\n", mf->get_original_mf(), mf->get_id()); m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 // m_CTR_BMT_Buffer->push(mf); } + if (mf->get_access_type() != META_RBW) { + m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 + OTP_counter++; + } // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); m_CTR_queue->pop(); + if (mf->get_access_type() != META_RBW) + OTP_counter++; } else { - print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); @@ -455,16 +493,16 @@ void mee::MAC_cycle() { bool output_full = m_MAC_CHECK_queue->full() || m_MAC_RET_queue->full();// && bool port_free = m_unit->m_MACcache->data_port_free(); - if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { + if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free && MAC_counter < CT_counter) { mem_fetch *mf = m_MAC_queue->top(); // print_addr("MAC cycle access:\t\t", mf); if (mf->is_write()) { //对于写MAC请求,则应等待密文被Hash为新MAC值 - if (!m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]) { + if (!m_MAC_set[mf->get_id()]) { return; } } else { //对于读MAC请求,生成的MAC Check任务 - // m_MAC_table[(new_addr_type)mf->get_original_mf()] = ++MAC_couter; + // m_MAC_table[(new_addr_type)mf->get_original_mf()] = ++MAC_counter; } std::list events; @@ -476,24 +514,26 @@ void mee::MAC_cycle() { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); if (mf->is_write()) { //MAC写HIT,则MAC Hash值使用结束 - m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]--; + m_MAC_set[mf->get_id()]--; } else { m_MAC_CHECK_queue->push(mf); //MAC读HIT,得到MAC值,发往MAC Check } m_MAC_queue->pop(); + MAC_counter++; // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 - m_MAC_set[m_MAC_table[(new_addr_type) mf->get_original_mf()]]--; + m_MAC_set[mf->get_id()]--; } m_MAC_queue->pop(); + MAC_counter++; } else { // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); // if 
(get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); - print_addr("MAC cycle RESERVATION_FAIL:\t", mf); + // print_addr("MAC cycle RESERVATION_FAIL:\t", mf); assert(!write_sent); assert(!read_sent); } @@ -537,8 +577,8 @@ void mee::BMT_cycle() { } if (mf->get_access_type() != META_RBW) { - m_BMT_table[(new_addr_type)mf] = ++BMT_couter; - m_BMT_HASH_queue->push(new unsigned(BMT_couter)); + m_BMT_table[(new_addr_type)mf] = ++BMT_counter; + m_BMT_HASH_queue->push(new unsigned(BMT_counter)); } std::list events; @@ -566,6 +606,7 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipelineaccess_ready() && !m_META_RET_queue->full()) { mem_fetch *mf = m_METAcache->next_access(); m_META_RET_queue->push(mf); + assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) print_addr("fill responses:", mf); // reply(m_METAcache, mf); @@ -577,10 +618,10 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE) { +void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type) { // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); - if (((mf->get_addr() & MASK) == BASE) || m_METAcache->waiting_for_fill(mf)) { + if ((mf->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf)) { print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + @@ -594,6 +635,12 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET // } m_unit->dram_mee_queue_pop(); } + } else if ((mf->get_data_type() == m_data_type) && !m_META_RET_queue->full()) { + if (mf->is_write() && mf->get_type() == WRITE_ACK) + mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + m_META_RET_queue->push(mf); + m_unit->dram_mee_queue_pop(); } } @@ -607,57 +654,61 @@ void mee::simple_cycle(unsigned cycle) { // META Cache fill responses META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); - for (int layer = 1; layer <= 4; layer++){ - META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[layer]); - } + // for (int layer = 1; layer <= 4; layer++){ + // META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[layer]); + // } // META_fill_responses(m_BMTcache); // dram to mee if (!m_unit->dram_mee_queue_empty()) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(); print_addr("Cipertext fill:", mf_return); - // print_addr("dram_mee_queue_top:\t", mf_return); - // mee to L2 + if ( + // mf_return->get_access_type() == L1_WR_ALLOC_R || + mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L1_WRBK_ACC || + mf_return->get_access_type() == L2_WRBK_ACC) { + m_unit->dram_mee_queue_pop(); + } else { - // META_fill(m_MACcache, mf_return, MAC_mask); - // META_fill(m_BMTcache, mf_return); - // if (!m_unit->mee_L2_queue_full()) { - - if (mf_return->get_access_type() >= META_ACC) { // META访存的返回,需要响应 - // printf("Success handle CTR_ACC: "); - // print_addr("META return to mee", mf_return); - // delete mf_return; - META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base); - 
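// ----------------------------------------------------------------------------
// Context for the BMT fills handled here: the counter blocks are themselves
// protected by a small integrity (Bonsai Merkle) tree, walked one layer at a
// time up to an on-chip root, which is what get_BMT_Layer() / BMT_ROOT_mf and
// the layer 1..4 handling in BMT_CHECK_cycle() implement. A minimal,
// self-contained sketch of that walk follows; node_hash, stored_hash and
// parent_of are hypothetical stand-ins, not functions from this patch series.
#include <cstdint>

bool verify_counter_sketch(uint64_t leaf_addr, uint64_t root_hash,
                           uint64_t (*node_hash)(uint64_t addr),
                           uint64_t (*stored_hash)(uint64_t parent_addr),
                           uint64_t (*parent_of)(uint64_t addr)) {
  uint64_t addr = leaf_addr;                    // start at the counter block
  for (int layer = 1; layer <= 4; ++layer) {    // mirrors get_BMT_Layer() 1..4
    uint64_t h = node_hash(addr);               // hash the fetched node
    uint64_t expect = (layer == 4)
                          ? root_hash           // top level checks the root
                          : stored_hash(parent_of(addr));
    if (h != expect) return false;              // integrity violation
    addr = parent_of(addr);                     // climb one level
  }
  return true;                                  // chain verified up to the root
}
// ----------------------------------------------------------------------------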
META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base); - for (int layer = 1; layer <= 4; layer++){ - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[layer], BMT_base[layer]); - } - } else { // 密文访存返回 - // reply L2 read - // reply L2 write back - //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); - int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + // print_addr("dram_mee_queue_top:\t", mf_return); + // mee to L2 - if (mf_return->is_write()) { // write - // print_addr("mee to L2 W:\t", mf_return); - if (!m_unit->mee_L2_queue_full(spid)){ - m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 + // META_fill(m_MACcache, mf_return, MAC_mask); + // META_fill(m_BMTcache, mf_return); + // if (!m_unit->mee_L2_queue_full()) { + + if (mf_return->get_access_type() >= META_ACC) { // META访存的返回,需要响应 + // printf("Success handle CTR_ACC: "); + // print_addr("META return to mee", mf_return); + // delete mf_return; + META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base, CTR); + META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base, MAC); + // for (int layer = 1; layer <= 4; layer++) { + // META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[layer], BMT_base[layer], BMT); + // } + } else { // 密文访存返回 + // reply L2 read + // reply L2 write back + //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + assert(mf_return->get_access_type() < META_ACC); + if (!m_Ciphertext_RET_queue->full()) { + // m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 + // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf_return]); + // m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check + m_Ciphertext_RET_queue->push(mf_return); m_unit->dram_mee_queue_pop(); + // printf("HHHHHHHHHHHHHHHH"); + } else { + // printf("HHHHHHHHHHHHHHHH"); } - } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read - m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 - m_MAC_table[(new_addr_type)mf_return] = ++MAC_couter; - assert(m_MAC_table[(new_addr_type)mf_return]); - m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check - m_unit->dram_mee_queue_pop(); - // printf("HHHHHHHHHHHHHHHH"); - } else { - // printf("HHHHHHHHHHHHHHHH"); + // print_addr("mee to L2: ", mf_return); } - // print_addr("mee to L2: ", mf_return); } - - // } + } else if (!m_unit->mee_dram_queue_empty()) { + // printf("SSSSSSSSSSSSSSS %d\n", ); } // printf("L2 to mee queue: %d %d\n", m_unit->m_sub_partition[0]->m_L2_mee_queue->empty(), m_unit->m_sub_partition[0]->m_L2_mee_queue->empty()); // L2 to mee @@ -666,28 +717,37 @@ void mee::simple_cycle(unsigned cycle) { // print_addr("L2 to mee: ", mf); // mee to dram assert(mf->is_raw()); + printf("TTTTTTTTTTTTTTTT\n"); - if (!m_CTR_queue->full(2) && !m_MAC_queue->full()) { - assert(!mf->is_write()); + if (!m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { + // assert(!mf->is_write()); if (mf->is_write()) { // write - printf("AAAAAAAAAAAAAAAAAAAAAA"); - if (mf->is_raw() && !m_AES_queue->full()) { - gen_CTR_mf(mf, META_RBW, false); - gen_CTR_mf(mf, META_ACC, true); - gen_MAC_mf(mf, true); - m_AES_queue->push(mf); 
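// ----------------------------------------------------------------------------
// The write branch just above fans one L2 writeback out into the usual
// counter-mode MEE steps: read the per-line counter (META_RBW), bump and
// rewrite it (META_ACC with wr = true), encrypt the line with a pad ("OTP")
// derived from address and counter, and recompute the MAC over the resulting
// ciphertext. A self-contained toy version of that arithmetic follows; prf()
// is a placeholder for the AES engine and every name below is illustrative,
// not code from this patch.
#include <cstdint>
#include <functional>

struct Counter { uint64_t major, minor; };

// Stand-in block cipher / hash: any PRF keeps the sketch runnable.
static uint64_t prf(uint64_t key, uint64_t x) {
  return std::hash<uint64_t>{}(key ^ (x * 0x9E3779B97F4A7C15ull));
}

// The pad depends only on address and counter, never on the data, so it can
// be computed while the ciphertext access is still in flight.
uint64_t make_pad(uint64_t key, uint64_t addr, Counter c) {
  return prf(key, addr ^ c.major ^ (c.minor << 20));
}

uint64_t encrypt_word(uint64_t key, uint64_t addr, Counter c, uint64_t pt) {
  return pt ^ make_pad(key, addr, c);           // ciphertext = plaintext ^ pad
}

uint64_t mac_word(uint64_t key, uint64_t addr, Counter c, uint64_t ct) {
  return prf(key ^ 0xA5A5ull, ct ^ addr ^ c.minor);  // MAC over the ciphertext
}
// ----------------------------------------------------------------------------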
//写密文请求,将明文送入AES中解密 - mf->set_cooked_status(); - // printf("BBBBBBBBBBBBBBBBB"); - } + assert(mf->is_raw()); + printf("LLLLLLLLLLLLLLLLLLL"); + // if (!m_Ciphertext_queue->full()) { + mf_counter++; + mf->set_id(mf_counter); + gen_CTR_mf(mf, META_RBW, false, 0); + gen_CTR_mf(mf, META_ACC, true, mf_counter); + gen_MAC_mf(mf, true, mf_counter); + // m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 + m_Ciphertext_queue->push(mf); + m_unit->L2_mee_queue_pop(cycle&1); + // mf->set_cooked_status(); + // printf("BBBBBBBBBBBBBBBBB"); + // } } else if (!m_unit->mee_dram_queue_full()) { // read // printf("CCCCCCCCCCCCCCCC"); - m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 - gen_CTR_mf(mf, META_ACC, false); - gen_MAC_mf(mf, false); + // m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 + mf_counter++; + mf->set_id(mf_counter); + m_Ciphertext_queue->push(mf); + gen_CTR_mf(mf, META_ACC, false, mf_counter); + gen_MAC_mf(mf, false, mf_counter); m_unit->L2_mee_queue_pop(cycle&1); } } else { - if (m_unit->get_mpid() <= 16){ + if (m_unit->get_mpid() <= 32){ if (m_CTR_RET_queue->full()) printf("AAAAAAAAAAAAAAAAAAAAAA"); if (m_MAC_RET_queue->full()) @@ -698,11 +758,13 @@ void mee::simple_cycle(unsigned cycle) { printf("DDDDDDDDDDDDDDDD"); // if (m_AES_queue->full()) // printf("EEEEEEEEEEEEEEEE"); - if (!m_unit->mee_dram_queue_empty()) + if (m_unit->mee_dram_queue_empty()) printf("FFFFFFFFFFFFFFFFFF"); } } + } else { + // printf("GGGGGGGGGGGGGG\n"); } MAC_CHECK_cycle(); MAC_cycle(); @@ -710,7 +772,7 @@ void mee::simple_cycle(unsigned cycle) { BMT_cycle(); AES_cycle(); CTR_cycle(); - // CT_cycle(); + CT_cycle(); } void mee::cycle(unsigned cycle) { @@ -721,11 +783,13 @@ void mee::cycle(unsigned cycle) { //BMT erase //BMT write需要阻塞,CTR read可以连续访问 //BMT 写前读 ok + +//BMT //检查写操作 -//实现写密文阻塞 -//实现mf id匹配 -//增加密文队列 +//ok 读密文在CTR访存前阻塞 +//ok 实现mf id匹配 //BMT不需要每层都Check -//增加访存类型的属性 +//ok 增加访存类型的属性 //单个HASH单元 -// \ No newline at end of file +//ok None Sector +//lazy_fetch_on_read不能和None_Sector混用,因为设置modified会Sector_MISS \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 6b1c9a106..37c08c694 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -17,7 +17,9 @@ class mee { void simple_cycle(unsigned cycle); void print_addr(char s[], mem_fetch *mf); void print_tag(); - void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, mem_fetch *original_mf) const; + void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, + unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type) const; void CTR_cycle(); void MAC_cycle(); void BMT_cycle(); @@ -30,13 +32,13 @@ class mee { new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); unsigned int get_BMT_Layer(new_addr_type addr); - void gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr); - void gen_MAC_mf(mem_fetch *mf, bool wr); - void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size); + void gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr, unsigned mf_id); + void gen_MAC_mf(mem_fetch *mf, bool wr, unsigned mf_id); + void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id); bool META_queue_empty(); void META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, 
const new_addr_type MASK); - void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE); + void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type); bool CTR_busy(); bool MAC_busy(); @@ -112,9 +114,11 @@ class mee { mem_fetch *BMT_ROOT_mf = NULL; int cnt = 0; - unsigned OTP_couter = 0; - unsigned MAC_couter = 0; - unsigned BMT_couter = 0; + unsigned mf_counter = 0; + unsigned CT_counter = 0; + unsigned OTP_counter = 0; + unsigned MAC_counter = 0; + unsigned BMT_counter = 0; }; \ No newline at end of file diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index a2a7aced2..9ef25d61c 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -70,6 +70,7 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_raw_addr.sub_partition = m_original_mf->get_tlx_addr().sub_partition; } raw_data = true; + id = 0; } mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; } diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index 905b7cf92..a3d12041a 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -33,6 +33,13 @@ #include "../abstract_hardware_model.h" #include "addrdec.h" +enum data_type { + DEFAULT = 0, + CTR, + MAC, + BMT +}; + enum mf_type { READ_REQUEST = 0, WRITE_REQUEST, @@ -129,8 +136,13 @@ class mem_fetch { mem_fetch *get_original_wr_mf() { return original_wr_mf; } bool is_raw() {return raw_data; } - void set_cooked_status() {raw_data = false; } + + unsigned get_id() { return this->id; } + void set_id(unsigned id) { this->id = id; } + + enum data_type get_data_type() { return this->m_data_type; } + void set_data_type(enum data_type m_data_type) { this->m_data_type = m_data_type; } private: // request source information @@ -179,7 +191,8 @@ class mem_fetch { mem_fetch *original_wr_mf; // this pointer refers to the original write req, // when fetch-on-write policy is used bool raw_data = true; - + unsigned id; + enum data_type m_data_type = DEFAULT; }; #endif From 6069f36ccff98f108f7b769374b8dfad6f73e625 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 25 Aug 2024 07:07:23 +0800 Subject: [PATCH 104/133] mee v1.1.1 --- .../base_mee/gpgpusim.config_base_mee | 8 +- src/gpgpu-sim/gpu-cache.cc | 3 - src/gpgpu-sim/l2cache.cc | 7 +- src/gpgpu-sim/l2cache.h | 3 + src/gpgpu-sim/mee.cc | 86 ++++++++++--------- src/gpgpu-sim/mee.h | 2 +- 6 files changed, 60 insertions(+), 49 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index 394874c8f..79df9778d 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -149,8 +149,8 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -#-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:W:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 @@ -163,8 +163,8 @@ -gpgpu_smem_latency 20 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -#-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,32000000:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index c8e65e831..373fa4b5f 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1084,9 +1084,6 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { } extra_mf_fields_lookup::iterator e = m_extra_mf_fields.find(mf); - if (e == m_extra_mf_fields.end()) { - printf("XXXXXXXXX%x %dXXXXXXXXXXXXXXX\n", mf->get_addr(), mf->get_access_type()); - } assert(e != m_extra_mf_fields.end()); assert(e->second.m_valid); mf->set_data_size(e->second.m_data_size); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 7b4a0bbcd..d682a3d87 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -558,7 +558,8 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { if (!m_config->m_L2_config.disabled()) { if (m_L2cache->access_ready() && !m_L2_icnt_queue->full()) { mem_fetch *mf = m_L2cache->next_access(); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // if (mf->get_access_type() == 9) + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); if (mf->get_access_type() != L2_WR_ALLOC_R) { // Don't pass write allocate read request back to @@ -586,7 +587,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // DRAM to L2 (texture) and icnt (not texture) if (!m_mee_L2_queue->empty()) { mem_fetch *mf = m_mee_L2_queue->top(); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { if (m_L2cache->fill_port_free()) { @@ -626,6 +627,8 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset, events); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); MEM_SUBPART_DPRINTF("Probing L2 cache Address=%llx, status=%u\n", diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index ab36c0aab..db37f9ba6 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -320,6 +320,9 @@ class L2interface : public mem_fetch_interface { virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); m_unit->m_L2_mee_queue->push(mf); + if (mf->get_access_type() == 9) + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("l2 to mee access type: %d\n",mf->get_access_type()); } diff --git a/src/gpgpu-sim/mee.cc 
b/src/gpgpu-sim/mee.cc index d0a126448..7fca617d8 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -8,7 +8,7 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - unsigned len = 8; + unsigned len = 800; m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, len); m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); @@ -195,9 +195,12 @@ void mee::CT_cycle() { if (!m_Ciphertext_queue->empty() && CT_counter < OTP_counter) { mem_fetch *mf = m_Ciphertext_queue->top(); + // print_addr("L2 to mee:\t", mf); + if (mf->get_sub_partition_id() == 58) + var = mf->get_addr(); if (mf->is_write()) { // write if (mf->is_raw() && !m_AES_queue->full()) { - printf("QQQQQQQQQQQQQQQQ\n"); + // printf("QQQQQQQQQQQQQQQQ\n"); m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 mf->set_cooked_status(); // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; @@ -206,10 +209,10 @@ void mee::CT_cycle() { // m_Ciphertext_queue->pop(); //加密完后才可以生成访存 } else { if (!mf->is_raw()) { - printf("RRRRRRRRRRRRRRR"); + // printf("RRRRRRRRRRRRRRR"); } if (m_AES_queue->full()) { - printf("SSSSSSSSSSSSSSSSSSS"); + // printf("SSSSSSSSSSSSSSSSSSS"); } } } else if (!m_unit->mee_dram_queue_full()) { // read @@ -229,11 +232,11 @@ void mee::AES_cycle() { // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); assert(OTP_id); - if (mf->is_write()) - printf("PPPPPPPPPPPPPP\n"); + // if (mf->is_write()) + // printf("PPPPPPPPPPPPPP\n"); if (m_OTP_set[OTP_id]) { // 得到了OTP和明文/密文,AES加密/解密完成 if (mf->is_write()) { //加密 - printf("OOOOOOOOOOOOOOOOOOOOOO\n"); + // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { m_OTP_set[OTP_id]--; // m_OTP_table[REQ_addr] = 0; @@ -245,7 +248,7 @@ void mee::AES_cycle() { m_AES_queue->pop(); // m_unit->L2_mee_queue_pop(spid); m_Ciphertext_queue->pop(); //写密文发往DRAM - printf("NNNNNNNNNNNNNNNNNNNNN\n"); + // printf("NNNNNNNNNNNNNNNNNNNNN\n"); } } else if (!m_unit->mee_L2_queue_full(spid)) { //解密 m_OTP_set[OTP_id]--; @@ -256,11 +259,11 @@ void mee::AES_cycle() { m_AES_queue->pop(); } else { - printf("IIIIIIIIIIIIIIII\n"); + // printf("IIIIIIIIIIIIIIII\n"); } } else { - if (mf->is_write()) - printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); + // if (mf->is_write()) + // printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); } } @@ -287,7 +290,7 @@ void mee::MAC_CHECK_cycle() { m_MAC_set[HASH_id]--; // m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); - printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); + // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } else { // if (mf->get_sub_partition_id() == 32) // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); @@ -395,8 +398,8 @@ void mee::CTR_cycle() { // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 - if (mf_return->get_sub_partition_id() == 60) - printf("%p OTP %d MISS\n", mf_return->get_original_mf(), mf_return->get_id()); + // if (mf_return->get_sub_partition_id() == 60) + // printf("%p OTP %d MISS\n", mf_return->get_original_mf(), mf_return->get_id()); // m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } @@ -416,7 +419,7 @@ void mee::CTR_cycle() { if 
(m_CTRcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR return; } else { - printf("MMMMMMMMMMMMMM\n"); + // printf("MMMMMMMMMMMMMM\n"); } } @@ -437,8 +440,8 @@ void mee::CTR_cycle() { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); - if (mf->get_sub_partition_id() == 60) - printf("%p OTP %d HIT\n", mf->get_original_mf(), mf->get_id()); + // if (mf->get_sub_partition_id() == 60) + // printf("%p OTP %d HIT\n", mf->get_original_mf(), mf->get_id()); m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 // m_CTR_BMT_Buffer->push(mf); @@ -608,7 +611,7 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipelinepush(mf); assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) - print_addr("fill responses:", mf); + // print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; } else { @@ -622,7 +625,7 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); if ((mf->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf)) { - print_addr("wating for fill:\t\t", mf); + // print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); @@ -662,12 +665,14 @@ void mee::simple_cycle(unsigned cycle) { // dram to mee if (!m_unit->dram_mee_queue_empty()) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(); - print_addr("Cipertext fill:", mf_return); + // if (mf_return->get_sub_partition_id() == 58) + // print_addr("Cipertext fill:", mf_return); if ( // mf_return->get_access_type() == L1_WR_ALLOC_R || - mf_return->get_access_type() == L2_WR_ALLOC_R || - // mf_return->get_access_type() == L1_WRBK_ACC || - mf_return->get_access_type() == L2_WRBK_ACC) { + // mf_return->get_access_type() == L2_WR_ALLOC_R || + mf_return->get_access_type() == L1_WRBK_ACC || + mf_return->get_access_type() == L2_WRBK_ACC + ) { m_unit->dram_mee_queue_pop(); } else { @@ -714,16 +719,19 @@ void mee::simple_cycle(unsigned cycle) { // L2 to mee if (!m_unit->L2_mee_queue_empty(cycle&1)) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + // if (mf->get_access_type() == 9) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // print_addr("L2 to mee: ", mf); // mee to dram assert(mf->is_raw()); - printf("TTTTTTTTTTTTTTTT\n"); + // printf("TTTTTTTTTTTTTTTT\n"); if (!m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { // assert(!mf->is_write()); if (mf->is_write()) { // write assert(mf->is_raw()); - printf("LLLLLLLLLLLLLLLLLLL"); + // printf("LLLLLLLLLLLLLLLLLLL"); // if (!m_Ciphertext_queue->full()) { mf_counter++; mf->set_id(mf_counter); @@ -747,20 +755,20 @@ void mee::simple_cycle(unsigned cycle) { m_unit->L2_mee_queue_pop(cycle&1); } } else { - if (m_unit->get_mpid() <= 32){ - if (m_CTR_RET_queue->full()) - printf("AAAAAAAAAAAAAAAAAAAAAA"); - if (m_MAC_RET_queue->full()) - printf("BBBBBBBBBBBBBBBBB"); - if (m_BMT_RET_queue->full()) - printf("CCCCCCCCCCCC"); - if (m_AES_queue->full()) - printf("DDDDDDDDDDDDDDDD"); - // if (m_AES_queue->full()) - // printf("EEEEEEEEEEEEEEEE"); - if (m_unit->mee_dram_queue_empty()) - printf("FFFFFFFFFFFFFFFFFF"); - } + // if (m_unit->get_mpid() <= 32){ + // if (m_CTR_RET_queue->full()) + // 
printf("AAAAAAAAAAAAAAAAAAAAAA"); + // if (m_MAC_RET_queue->full()) + // printf("BBBBBBBBBBBBBBBBB"); + // if (m_BMT_RET_queue->full()) + // printf("CCCCCCCCCCCC"); + // if (m_AES_queue->full()) + // printf("DDDDDDDDDDDDDDDD"); + // // if (m_AES_queue->full()) + // // printf("EEEEEEEEEEEEEEEE"); + // // if (m_unit->mee_dram_queue_empty()) + // // printf("FFFFFFFFFFFFFFFFFF"); + // } } } else { diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 37c08c694..498af7558 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -119,6 +119,6 @@ class mee { unsigned OTP_counter = 0; unsigned MAC_counter = 0; unsigned BMT_counter = 0; - + int var; }; \ No newline at end of file From 12fcca7c81a12c35cbdfd70ac4ce01eb70dbc357 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 25 Aug 2024 07:52:52 +0800 Subject: [PATCH 105/133] mee v1.1.2 --- .../SM7_QV100/base_mee/gpgpusim.config_base_mee | 2 +- src/gpgpu-sim/l2cache.cc | 8 ++++++-- src/gpgpu-sim/mee.cc | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index 79df9778d..a3dbbbfc9 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -164,7 +164,7 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,32000000:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index d682a3d87..3579915c8 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -587,10 +587,11 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // DRAM to L2 (texture) and icnt (not texture) if (!m_mee_L2_queue->empty()) { mem_fetch *mf = m_mee_L2_queue->top(); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { if (m_L2cache->fill_port_free()) { + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + mf->set_status(IN_PARTITION_L2_FILL_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_L2cache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + @@ -627,7 +628,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset, events); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\tstatus:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type(), status); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); @@ -671,6 +672,9 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // L2 cache accepted request m_icnt_L2_queue->pop(); } else { + if (m_L2_mee_queue->full()) { + 
printf("FFFFFFFFFFFFFFFFFFFF\n"); + } assert(!write_sent); assert(!read_sent); // L2 cache lock-up: will try again next cycle diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 7fca617d8..e3fb46b73 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -668,8 +668,8 @@ void mee::simple_cycle(unsigned cycle) { // if (mf_return->get_sub_partition_id() == 58) // print_addr("Cipertext fill:", mf_return); if ( - // mf_return->get_access_type() == L1_WR_ALLOC_R || - // mf_return->get_access_type() == L2_WR_ALLOC_R || + mf_return->get_access_type() == L1_WR_ALLOC_R || + mf_return->get_access_type() == L2_WR_ALLOC_R || mf_return->get_access_type() == L1_WRBK_ACC || mf_return->get_access_type() == L2_WRBK_ACC ) { From a89f06c1da953fd9b43521f803166da3b7af13a2 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 25 Aug 2024 08:46:16 +0800 Subject: [PATCH 106/133] mee v1.1.2 --- src/gpgpu-sim/l2cache.cc | 4 +++- src/gpgpu-sim/mee.cc | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 3579915c8..41964bdfe 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -587,9 +587,11 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // DRAM to L2 (texture) and icnt (not texture) if (!m_mee_L2_queue->empty()) { mem_fetch *mf = m_mee_L2_queue->top(); - + // assert(mf_return->get_access_type() != 4); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { + assert(mf->get_access_type() != 4); if (m_L2cache->fill_port_free()) { + assert(mf->get_access_type() != 4); printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); mf->set_status(IN_PARTITION_L2_FILL_QUEUE, diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index e3fb46b73..4de8e07d1 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -175,11 +175,16 @@ void mee::CT_cycle() { int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); // if (mf_return->get_access_type() != L1_WR_ALLOC_R && mf_return->get_access_type() != L2_WR_ALLOC_R) { if (mf_return->is_write()) { // write + // assert(!mf_return->is_write()); // print_addr("mee to L2 W:\t", mf_return); - // if (!m_unit->mee_L2_queue_full(spid)){ - // m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 + if (!m_unit->mee_L2_queue_full(spid)){ + // assert(!mf_return->is_write()); + // assert(mf_return->get_access_type() != 4); + m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 m_Ciphertext_RET_queue->pop(); - // } + } else { + assert(mf_return->get_access_type() != 4); + } } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; @@ -199,7 +204,9 @@ void mee::CT_cycle() { if (mf->get_sub_partition_id() == 58) var = mf->get_addr(); if (mf->is_write()) { // write + // assert(!mf->is_write()); if (mf->is_raw() && !m_AES_queue->full()) { + // assert(!mf->is_write()); // printf("QQQQQQQQQQQQQQQQ\n"); m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 mf->set_cooked_status(); @@ -236,8 +243,10 @@ void mee::AES_cycle() { // printf("PPPPPPPPPPPPPP\n"); if (m_OTP_set[OTP_id]) { // 得到了OTP和明文/密文,AES加密/解密完成 if (mf->is_write()) { //加密 + // assert(!mf->is_write()); // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { + // 
assert(!mf->is_write()); m_OTP_set[OTP_id]--; // m_OTP_table[REQ_addr] = 0; m_unit->mee_dram_queue_push(mf); //加密完后更新DRAM中的密文 @@ -665,14 +674,16 @@ void mee::simple_cycle(unsigned cycle) { // dram to mee if (!m_unit->dram_mee_queue_empty()) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(); + // assert(!mf_return->is_write()); // if (mf_return->get_sub_partition_id() == 58) // print_addr("Cipertext fill:", mf_return); if ( mf_return->get_access_type() == L1_WR_ALLOC_R || - mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L2_WR_ALLOC_R || mf_return->get_access_type() == L1_WRBK_ACC || mf_return->get_access_type() == L2_WRBK_ACC ) { + assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); m_unit->dram_mee_queue_pop(); } else { @@ -693,6 +704,7 @@ void mee::simple_cycle(unsigned cycle) { // META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[layer], BMT_base[layer], BMT); // } } else { // 密文访存返回 + // assert(mf_return->get_access_type() != 4); // reply L2 read // reply L2 write back //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); From ac6c328d988dee4be8d8371066b04d966d28b986 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 25 Aug 2024 20:07:52 +0800 Subject: [PATCH 107/133] mee v1.2 --- .../base_mee/gpgpusim.config_base_mee | 4 +- src/abstract_hardware_model.h | 2 +- src/gpgpu-sim/gpu-cache.cc | 25 +++ src/gpgpu-sim/gpu-cache.h | 16 ++ src/gpgpu-sim/l2cache.cc | 25 ++- src/gpgpu-sim/l2cache.h | 12 +- src/gpgpu-sim/mee.cc | 181 +++++++++++------- src/gpgpu-sim/mee.h | 19 +- 8 files changed, 181 insertions(+), 103 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index a3dbbbfc9..e15749037 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -150,7 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 #-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_cache:dl1 N:4:128:64,L:T:m:W:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:W:L,A:512:8,160:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 @@ -164,7 +164,7 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,320:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 509fdd4b1..e29c7b5d3 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -768,7 +768,7 @@ typedef std::bitset mem_access_sector_mask_t; MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W), \ MA_TUP(L1_WRBK_ACC), MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R), \ MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), MA_TUP(META_ACC), \ - MA_TUP(META_RBW),\ + MA_TUP(META_RBW), MA_TUP(META_WRBK_ACC), MA_TUP(META_WR_ALLOC_R),\ MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type) #define MA_TUP_BEGIN(X) enum X { diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 373fa4b5f..90e1fe9b3 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1349,6 +1349,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + n_mf->set_data_type(mf->get_data_type()); + n_mf->set_chip(mf->get_tlx_addr().chip); + n_mf->set_parition(mf->get_tlx_addr().sub_partition); + assert(n_mf->get_sub_partition_id() == mf->get_sub_partition_id()); bool do_miss = false; bool wb = false; evicted_block_info evicted; @@ -1372,6 +1376,7 @@ enum cache_request_status data_cache::wr_miss_wa_naive( NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1425,6 +1430,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1474,6 +1480,9 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, NULL, mf); + n_mf->set_data_type(mf->get_data_type()); + n_mf->set_chip(mf->get_tlx_addr().chip); + n_mf->set_parition(mf->get_tlx_addr().sub_partition); new_addr_type block_addr = m_config.block_addr(addr); bool do_miss = false; bool wb = false; @@ -1498,6 +1507,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1565,6 +1575,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + 
wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1645,6 +1656,7 @@ enum cache_request_status data_cache::rd_miss_base( evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, NULL); + wb->set_data_type(mf->get_data_type()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1799,6 +1811,19 @@ enum cache_request_status l2_cache::probe(new_addr_type addr, mem_fetch *mf) con return data_cache::probe(addr, mf); } +// The l2 cache access function calls the base data_cache access +// implementation. When the L2 needs to diverge from L1, L2 specific +// changes should be made here. +enum cache_request_status meta_cache::access(new_addr_type addr, mem_fetch *mf, + unsigned time, + std::list &events) { + return data_cache::access(addr, mf, time, events); +} + +enum cache_request_status meta_cache::probe(new_addr_type addr, mem_fetch *mf) const { + return data_cache::probe(addr, mf); +} + /// Access function for tex_cache /// return values: RESERVATION_FAIL if request could not be accepted /// otherwise returns HIT_RESERVED or MISS; NOTE: *never* returns HIT diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index f5011c956..8c223fd57 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1705,6 +1705,22 @@ class l2_cache : public data_cache { virtual enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; }; +class meta_cache : public data_cache { + public: + meta_cache(const char *name, cache_config &config, int core_id, int type_id, + mem_fetch_interface *memport, mem_fetch_allocator *mfcreator, + enum mem_fetch_status status, class gpgpu_sim *gpu) + : data_cache(name, config, core_id, type_id, memport, mfcreator, status, + META_WR_ALLOC_R, META_WRBK_ACC, gpu) {} + + virtual ~meta_cache() {} + + virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf, + unsigned time, + std::list &events); + virtual enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; +}; + /*****************************************************************************/ // See the following paper to understand this cache model: diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 41964bdfe..e93103c4e 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -87,15 +87,15 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_metainterface = new metainterface(this); m_mf_allocator = new partition_mf_allocator(config); - if (!m_config->m_L2_config.disabled()) { + if (!m_config->m_META_config.disabled()) { m_CTRcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_MACcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_BMTcache = - new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); 
} @@ -376,7 +376,7 @@ void memory_partition_unit::dram_cycle() { if (!mee_dram_queue_empty() && can_issue_to_dram(spid)) { mem_fetch *mf = mee_dram_queue_top(); - + if (global_sub_partition_id_to_local_id(mf->get_sub_partition_id()) != spid) continue; if (m_dram->full(mf->is_write())) break; @@ -458,7 +458,7 @@ void memory_partition_unit::print(FILE *fp) const { void memory_partition_unit::accumulate_METAcache_stats( class cache_stats &l2_stats, char META[]) const { - class l2_cache *m_METAcache; + class meta_cache *m_METAcache; if (strcmp(META, "CTR") == 0) { m_METAcache = m_CTRcache; } else if (strcmp(META, "MAC") == 0) { @@ -476,7 +476,7 @@ void memory_partition_unit::accumulate_METAcache_stats( void memory_partition_unit::get_METAcache_sub_stats( struct cache_sub_stats &css, char META[]) const { - class l2_cache *m_METAcache; + class meta_cache *m_METAcache; if (strcmp(META, "CTR") == 0) { m_METAcache = m_CTRcache; } else if (strcmp(META, "MAC") == 0) { @@ -559,7 +559,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { if (m_L2cache->access_ready() && !m_L2_icnt_queue->full()) { mem_fetch *mf = m_L2cache->next_access(); // if (mf->get_access_type() == 9) - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); if (mf->get_access_type() != L2_WR_ALLOC_R) { // Don't pass write allocate read request back to @@ -587,12 +587,13 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // DRAM to L2 (texture) and icnt (not texture) if (!m_mee_L2_queue->empty()) { mem_fetch *mf = m_mee_L2_queue->top(); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // assert(mf_return->get_access_type() != 4); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { assert(mf->get_access_type() != 4); if (m_L2cache->fill_port_free()) { assert(mf->get_access_type() != 4); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); mf->set_status(IN_PARTITION_L2_FILL_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); @@ -617,6 +618,8 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // new L2 texture accesses and/or non-texture accesses if (!m_L2_mee_queue->full() && !m_icnt_L2_queue->empty()) { mem_fetch *mf = m_icnt_L2_queue->top(); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + if (!m_config->m_L2_config.disabled() && ((m_config->m_L2_texure_only && mf->istexture()) || (!m_config->m_L2_texure_only))) { @@ -630,7 +633,6 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset, events); - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\tstatus:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type(), status); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); @@ -674,9 +676,6 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // L2 cache accepted request m_icnt_L2_queue->pop(); } else { 
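// ----------------------------------------------------------------------------
// Aside on the meta_cache change in this patch: the metadata caches now pass
// their own write-allocate / writeback access types (META_WR_ALLOC_R,
// META_WRBK_ACC), so traffic they generate internally can be told apart from
// ordinary L2 traffic once it reaches the DRAM-side queues (the mee code keys
// off "access_type >= META_ACC" for the same purpose). A small classifier in
// that spirit; the SK_* enumerators only mirror the names used in the patch.
enum acc_sketch {
  SK_L2_WR_ALLOC_R, SK_L2_WRBK_ACC,
  SK_META_ACC, SK_META_RBW, SK_META_WRBK_ACC, SK_META_WR_ALLOC_R
};

bool is_metadata_traffic(acc_sketch a) {
  switch (a) {
    case SK_META_ACC:
    case SK_META_RBW:
    case SK_META_WRBK_ACC:
    case SK_META_WR_ALLOC_R:
      return true;                     // generated by / for CTR, MAC or BMT
    default:
      return false;                    // ordinary L2 <-> DRAM traffic
  }
}
// ----------------------------------------------------------------------------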
- if (m_L2_mee_queue->full()) { - printf("FFFFFFFFFFFFFFFFFFFF\n"); - } assert(!write_sent); assert(!read_sent); // L2 cache lock-up: will try again next cycle diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index db37f9ba6..5bd0950de 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -139,9 +139,9 @@ class memory_partition_unit { // class memory_sub_partition **m_sub_partition; class dram_t *m_dram; - class l2_cache *m_CTRcache; - class l2_cache *m_MACcache; - class l2_cache *m_BMTcache; + class meta_cache *m_CTRcache; + class meta_cache *m_MACcache; + class meta_cache *m_BMTcache; class mee *m_mee; class metainterface *m_metainterface; partition_mf_allocator *m_mf_allocator; @@ -320,8 +320,8 @@ class L2interface : public mem_fetch_interface { virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); m_unit->m_L2_mee_queue->push(mf); - if (mf->get_access_type() == 9) - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // if (mf->get_access_type() == 9) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); // printf("l2 to mee access type: %d\n",mf->get_access_type()); } @@ -340,6 +340,8 @@ class metainterface : public mem_fetch_interface { } virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); + // printf("%saddr: %x\tmf_type: %d\tsp_addr: %x\taccess type:%d\n", "mee to dram:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); + m_unit->mee_dram_queue_push(mf); } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 4de8e07d1..b730f372b 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,7 +1,7 @@ #include "mee.h" #include -mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : +mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : m_unit(unit), m_CTRcache(CTRcache), m_MACcache(MACcache), @@ -22,10 +22,10 @@ mee::mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_c m_OTP_queue = new fifo_pipeline("meta-queue", 40, 40 + len); m_AES_queue = new fifo_pipeline("meta-queue", 0, len); - m_MAC_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); - m_BMT_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + // m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, len); @@ -35,9 +35,8 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25 || true) { - printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\tmask_id: %d\tmask_addr:%x\taccess type:%d\n", s, mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), get_sub_partition_id(mf), get_partition_addr(mf), mf->get_access_type()); - // print_tag(); + if (mf->get_sub_partition_id() == 4 || true) { + // 
printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); // print_tag(); } } @@ -182,15 +181,15 @@ void mee::CT_cycle() { // assert(mf_return->get_access_type() != 4); m_unit->mee_L2_queue_push(spid, mf_return); //写密文完成,返回L2 m_Ciphertext_RET_queue->pop(); - } else { - assert(mf_return->get_access_type() != 4); + // } else { + // assert(mf_return->get_access_type() != 4); } - } else if (!m_AES_queue->full() && !m_MAC_HASH_queue->full()) { // read + } else if (!m_AES_queue->full() && !m_HASH_queue->full()) { // read m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; // assert(m_MAC_table[(new_addr_type)mf_return]); - m_MAC_HASH_queue->push(new unsigned(mf_return->get_id())); //从DRAM中取到密文,对密文进行MAC Hash - // m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check + m_HASH_queue->push(new hash(MAC, mf_return->get_id())); //从DRAM中取到密文,对密文进行MAC Hash + // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check m_Ciphertext_RET_queue->pop(); } // } else { @@ -212,7 +211,7 @@ void mee::CT_cycle() { mf->set_cooked_status(); // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; // assert(m_MAC_table[(new_addr_type)mf]); - // m_MAC_HASH_queue->push(new unsigned(mf->get_id())); //加密完后得到密文,对密文进行MAC Hash + // m_HASH_queue->push(new unsigned(mf->get_id())); //加密完后得到密文,对密文进行MAC Hash // m_Ciphertext_queue->pop(); //加密完后才可以生成访存 } else { if (!mf->is_raw()) { @@ -245,7 +244,7 @@ void mee::AES_cycle() { if (mf->is_write()) { //加密 // assert(!mf->is_write()); // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); - if (!m_unit->mee_dram_queue_full() && !m_MAC_HASH_queue->full()) { + if (!m_unit->mee_dram_queue_full() && !m_HASH_queue->full()) { // assert(!mf->is_write()); m_OTP_set[OTP_id]--; // m_OTP_table[REQ_addr] = 0; @@ -253,7 +252,7 @@ void mee::AES_cycle() { CT_counter++; // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; // assert(m_MAC_table[(new_addr_type)mf]); - m_MAC_HASH_queue->push(new unsigned(mf->get_id())); //加密完后得到密文,对密文进行MAC Hash + m_HASH_queue->push(new hash(MAC, mf->get_id())); //加密完后得到密文,对密文进行MAC Hash m_AES_queue->pop(); // m_unit->L2_mee_queue_pop(spid); m_Ciphertext_queue->pop(); //写密文发往DRAM @@ -296,7 +295,7 @@ void mee::MAC_CHECK_cycle() { // printf("%x\n", OTP_addr); assert(HASH_id); if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 - m_MAC_set[HASH_id]--; + // m_MAC_set[HASH_id]--; // m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); @@ -306,14 +305,19 @@ void mee::MAC_CHECK_cycle() { } } - if (!m_MAC_HASH_queue->empty()) { + if (!m_HASH_queue->empty()) { // printf("BBBBBBBBBBBBBBB\n"); - unsigned *mf = m_MAC_HASH_queue->top(); + hash *mf = m_HASH_queue->top(); if (mf) { - m_MAC_set[*mf]++; //MAC Hash计算完成 + if (mf->first == MAC) + m_MAC_set[mf->first]++; //MAC Hash计算完成 + if (mf->first == BMT) + m_BMT_set[mf->first]++; //BMT Hash计算完成 + m_HASH_queue->pop(); } // delete mf; - m_MAC_HASH_queue->pop(); + else + m_HASH_queue->pop(); } } @@ -322,14 +326,12 @@ void mee::BMT_CHECK_cycle() { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_BMT_CHECK_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf; //BMT Cache的值 - unsigned HASH_id = m_BMT_table[REQ_addr]; //BMT Hash值 + unsigned HASH_id = mf->get_id(); //BMT Hash值 // if 
(mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); // assert(mf); - if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2) && !m_BMT_HASH_queue->full()) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2) && !m_HASH_queue->full()) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; - m_BMT_table[REQ_addr] = 0; - m_BMT_table.erase(m_BMT_table.find(REQ_addr)); m_BMT_CHECK_queue->pop(); // BMT_busy = false; // print_addr("BMT Check finish:", mf); @@ -338,26 +340,26 @@ void mee::BMT_CHECK_cycle() { // printf("BBBBBB"); //计算下一层BMT - if (!mf || get_BMT_Layer(mf->get_addr()) == 5) { + if (get_BMT_Layer(mf->get_addr()) == 4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; cnt--; - } else if (get_BMT_Layer(mf->get_addr()) == 4) { + } else if (get_BMT_Layer(mf->get_addr()) == 3) { // printf("AAAAAAAAAAAA\n"); assert(!m_BMT_CHECK_queue->full()); - m_BMT_CHECK_queue->push(BMT_ROOT_mf); - m_BMT_table[(new_addr_type) BMT_ROOT_mf] = ++BMT_counter; - assert(!m_BMT_HASH_queue->full()); - m_BMT_HASH_queue->push(new unsigned(BMT_counter)); + m_BMT_CHECK_queue->push(mf); + assert(!m_HASH_queue->full()); + m_HASH_queue->push(new hash(BMT, HASH_id)); } else { // printf("XXXXXXXXXXXXX\n"); assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); assert(!m_BMT_queue->full(2)); - gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128, 0); + if (mf->is_write()) + gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128, 0); assert(!m_BMT_queue->full()); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, 0); - assert(!m_BMT_HASH_queue->full()); - // m_BMT_HASH_queue->push(m_BMT_table[(new_addr_type) mf]); + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); + assert(!m_HASH_queue->full()); + m_HASH_queue->push(new hash(BMT, HASH_id)); } // if (REQ_addr == (new_addr_type) BMT_ROOT_mf) { @@ -367,23 +369,25 @@ void mee::BMT_CHECK_cycle() { } } - if (!m_BMT_HASH_queue->empty()) { - // printf("BBBBBBBBBBBBBBB\n"); - unsigned *mf = m_BMT_HASH_queue->top(); - if (mf) { - m_BMT_set[*mf]++; //BMT Hash计算完成 - } - // delete mf; - m_BMT_HASH_queue->pop(); - } + // if (!m_HASH_queue->empty()) { + // // printf("BBBBBBBBBBBBBBB\n"); + // hash *mf = m_HASH_queue->top(); + // if (mf) { + // if (mf->first == BMT) + // m_BMT_set[mf->first]++; //BMT Hash计算完成 + // } + // // delete mf; + // else + // m_HASH_queue->pop(); + // } // CTR to BMT - if (!m_CTR_BMT_Buffer->empty() && !m_BMT_queue->full() && !m_BMT_HASH_queue->full() && !BMT_busy && m_BMT_table.empty()) { + if (!m_CTR_BMT_Buffer->empty() && !m_BMT_queue->full() && !m_HASH_queue->full() && !BMT_busy && m_BMT_table.empty()) { assert(cnt==0); // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, 0); - // m_BMT_HASH_queue->push(mf); + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); + m_HASH_queue->push(new hash(BMT, mf->get_id())); m_CTR_BMT_Buffer->pop(); BMT_busy = true; cnt++; @@ -409,7 +413,7 @@ void mee::CTR_cycle() { m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 // if (mf_return->get_sub_partition_id() == 60) // printf("%p OTP %d MISS\n", mf_return->get_original_mf(), mf_return->get_id()); - // m_CTR_BMT_Buffer->push(mf_return); + m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } } @@ -453,7 +457,7 @@ void mee::CTR_cycle() { // printf("%p OTP %d HIT\n", mf->get_original_mf(), mf->get_id()); m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 - // m_CTR_BMT_Buffer->push(mf); + 
m_CTR_BMT_Buffer->push(mf); } if (mf->get_access_type() != META_RBW) { m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 @@ -526,7 +530,7 @@ void mee::MAC_cycle() { // if (!m_OTP_queue->full()) { // print_addr("HIT OTP:\t\t", mf); if (mf->is_write()) { //MAC写HIT,则MAC Hash值使用结束 - m_MAC_set[mf->get_id()]--; + // m_MAC_set[mf->get_id()]--; } else { m_MAC_CHECK_queue->push(mf); //MAC读HIT,得到MAC值,发往MAC Check } @@ -537,7 +541,7 @@ void mee::MAC_cycle() { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 - m_MAC_set[mf->get_id()]--; + // m_MAC_set[mf->get_id()]--; } m_MAC_queue->pop(); MAC_counter++; @@ -583,15 +587,10 @@ void mee::BMT_cycle() { if (mf->is_write()) { //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT - if (!m_BMT_set[(new_addr_type)mf->get_original_mf()]) { + if (!m_BMT_set[mf->get_id()]) { return; } } - - if (mf->get_access_type() != META_RBW) { - m_BMT_table[(new_addr_type)mf] = ++BMT_counter; - m_BMT_HASH_queue->push(new unsigned(BMT_counter)); - } std::list events; enum cache_request_status status = m_BMTcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); @@ -614,11 +613,12 @@ void mee::BMT_cycle() { } }; -void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { +void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { if (m_METAcache->access_ready() && !m_META_RET_queue->full()) { mem_fetch *mf = m_METAcache->next_access(); - m_META_RET_queue->push(mf); - assert(mf->get_access_type() == META_ACC); + if (mf->get_access_type() == META_ACC) + m_META_RET_queue->push(mf); + // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) // print_addr("fill responses:", mf); // reply(m_METAcache, mf); @@ -630,14 +630,18 @@ void mee::META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type) { +void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type) { // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); if ((mf->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf)) { // print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { + // assert(mf->get_access_type() != META_WR_ALLOC_R); m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); + if (m_data_type == MAC) + print_addr("MAC fill:\t", mf); + assert(!mf->is_write()); // if (m_METAcache == m_BMTcache) // print_addr("fill:\t\t\t\t", mf); // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); @@ -651,7 +655,7 @@ void mee::META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_MET if (mf->is_write() && mf->get_type() == WRITE_ACK) mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_META_RET_queue->push(mf); + // m_META_RET_queue->push(mf); m_unit->dram_mee_queue_pop(); } } @@ -667,7 +671,7 @@ void mee::simple_cycle(unsigned cycle) { META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); // for (int layer = 1; layer <= 4; layer++){ - // 
META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[layer]); + META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[1]); // } // META_fill_responses(m_BMTcache); @@ -676,12 +680,14 @@ void mee::simple_cycle(unsigned cycle) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(); // assert(!mf_return->is_write()); // if (mf_return->get_sub_partition_id() == 58) - // print_addr("Cipertext fill:", mf_return); - if ( - mf_return->get_access_type() == L1_WR_ALLOC_R || - // mf_return->get_access_type() == L2_WR_ALLOC_R || - mf_return->get_access_type() == L1_WRBK_ACC || - mf_return->get_access_type() == L2_WRBK_ACC + print_addr("waiting for fill:\t", mf_return); + // printf("%saddr: %x\tdata_type: %d\tsp_addr: %x\taccess type:%d\n", "fill queue:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); + + if (false + // mf_return->get_access_type() == L1_WR_ALLOC_R || + // // mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L1_WRBK_ACC || + // mf_return->get_access_type() == L2_WRBK_ACC ) { assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); m_unit->dram_mee_queue_pop(); @@ -701,7 +707,7 @@ void mee::simple_cycle(unsigned cycle) { META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base, CTR); META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base, MAC); // for (int layer = 1; layer <= 4; layer++) { - // META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[layer], BMT_base[layer], BMT); + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT); // } } else { // 密文访存返回 // assert(mf_return->get_access_type() != 4); @@ -714,7 +720,7 @@ void mee::simple_cycle(unsigned cycle) { // m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; // assert(m_MAC_table[(new_addr_type)mf_return]); - // m_MAC_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check + // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check m_Ciphertext_RET_queue->push(mf_return); m_unit->dram_mee_queue_pop(); // printf("HHHHHHHHHHHHHHHH"); @@ -732,7 +738,7 @@ void mee::simple_cycle(unsigned cycle) { if (!m_unit->L2_mee_queue_empty(cycle&1)) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); // if (mf->get_access_type() == 9) - // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); // print_addr("L2 to mee: ", mf); // mee to dram @@ -796,6 +802,35 @@ void mee::simple_cycle(unsigned cycle) { } void mee::cycle(unsigned cycle) { + if (!m_unit->dram_mee_queue_empty()) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + if (false + // mf_return->get_is_write() || + // mf_return->get_access_type() == L1_WR_ALLOC_R || + // mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L1_WRBK_ACC || + // mf_return->get_access_type() == L2_WRBK_ACC + ) { + // assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); + m_unit->dram_mee_queue_pop(); + } else { + if (!m_unit->mee_L2_queue_full(spid)) { + // m_OTP_table[REQ_addr] = 0; + // print_addr("mee 
to L2 R:\t", mf); + m_unit->mee_L2_queue_push(spid, mf_return); + m_unit->dram_mee_queue_pop(); + + } + } + } + if (!m_unit->L2_mee_queue_empty(cycle&1)) { + mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + if (!m_unit->mee_dram_queue_full()) { + m_unit->mee_dram_queue_push(mf); + m_unit->L2_mee_queue_pop(cycle&1); + } + } } //BMT next Layer @@ -804,11 +839,11 @@ void mee::cycle(unsigned cycle) { //BMT write需要阻塞,CTR read可以连续访问 //BMT 写前读 ok -//BMT -//检查写操作 +//ok BMT +//ok 检查写操作 //ok 读密文在CTR访存前阻塞 //ok 实现mf id匹配 -//BMT不需要每层都Check +//ok BMT不需要每层都Check //ok 增加访存类型的属性 //单个HASH单元 //ok None Sector diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 498af7558..e23aa8656 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -12,7 +12,7 @@ class mee { public: - mee(class memory_partition_unit *unit, class l2_cache *CTRcache, class l2_cache *MACcache, class l2_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); + mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); void cycle(unsigned cycle); void simple_cycle(unsigned cycle); void print_addr(char s[], mem_fetch *mf); @@ -37,8 +37,8 @@ class mee { void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id); bool META_queue_empty(); - void META_fill_responses(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); - void META_fill(class l2_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type); + void META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); + void META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type); bool CTR_busy(); bool MAC_busy(); @@ -48,9 +48,10 @@ class mee { private: - class l2_cache *m_CTRcache; - class l2_cache *m_MACcache; - class l2_cache *m_BMTcache; + typedef std::pair hash; + class meta_cache *m_CTRcache; + class meta_cache *m_MACcache; + class meta_cache *m_BMTcache; class memory_partition_unit *m_unit; const memory_config *m_config; class gpgpu_sim *m_gpu; @@ -67,14 +68,14 @@ class mee { fifo_pipeline *m_OTP_queue; fifo_pipeline *m_AES_queue; - fifo_pipeline *m_MAC_HASH_queue; + fifo_pipeline *m_HASH_queue; fifo_pipeline *m_MAC_CHECK_queue; //m_CTR_BMT_Buffer-->m_BMT_CHECK_queue--|--> - // |->m_BMT_HASH_queue---| + // |->m_HASH_queue---| // m_BMT_queue-->m_BMT_RET_queue--> fifo_pipeline *m_BMT_CHECK_queue; - fifo_pipeline *m_BMT_HASH_queue; + // fifo_pipeline *m_HASH_queue; fifo_pipeline *m_CTR_BMT_Buffer; //CTR: 1111 1110 0000 0000 0000 0000 0000 0000 From f918f1c366229197bbe1f833d9490907a1e09b8c Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Tue, 27 Aug 2024 14:37:19 +0800 Subject: [PATCH 108/133] mee v1.2 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index b730f372b..2e2be2b07 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -845,6 +845,6 @@ void mee::cycle(unsigned cycle) { //ok 实现mf id匹配 //ok BMT不需要每层都Check //ok 增加访存类型的属性 -//单个HASH单元 +//ok 单个HASH单元 //ok None Sector //lazy_fetch_on_read不能和None_Sector混用,因为设置modified会Sector_MISS \ No newline at end of file From 1c7c4bbc806fc15034c2ded5cec2af66c028e87c Mon Sep 17 00:00:00 
2001 From: zhangqr <70464752@qq.com> Date: Thu, 29 Aug 2024 23:59:05 +0800 Subject: [PATCH 109/133] mee v1.2.1 --- src/gpgpu-sim/gpu-cache.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 90e1fe9b3..6e5b8eda7 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -270,7 +270,10 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, return HIT; } else { idx = index; - return SECTOR_MISS; + if (m_config.m_cache_type == SECTOR) + return SECTOR_MISS; + else + return MISS; } } else if (line->is_valid_line() && line->get_status(mask) == INVALID) { From d0d0be4b2109838bd423d82c84a7cae964035a5e Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 31 Aug 2024 00:57:44 +0800 Subject: [PATCH 110/133] mee v1.2.2 --- .../base_mee/gpgpusim.config_base_mee | 15 +-- src/gpgpu-sim/mee.cc | 94 +++++-------------- 2 files changed, 30 insertions(+), 79 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index e15749037..cdac68eea 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -150,7 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 #-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_cache:dl1 N:4:128:64,L:T:m:W:L,A:512:8,160:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,160:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 @@ -164,22 +164,25 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2 N:32:128:24,L:B:m:W:P,A:192:4,320:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,320:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 --gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 -gpgpu_memory_partition_indexing 2 # 128 KB Inst. --gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex # Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod --gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 # 64 KB Const --gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 2e2be2b07..9ab9de24b 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -8,7 +8,7 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - unsigned len = 800; + unsigned len = 64; m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, len); m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); @@ -35,8 +35,8 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (mf->get_sub_partition_id() == 4 || true) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); // print_tag(); + if (m_unit->get_mpid() == 0) { + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); } } @@ -56,8 +56,6 @@ void mee::print_tag() { new_addr_type mee::get_partition_addr(mem_fetch *mf) { new_addr_type partition_addr = mf->get_addr() >> (8 + 6) << 8; partition_addr |= mf->get_addr() & ((1 << 8) - 1); - // return partition_addr; - // printf("%x %x\n", mf->get_addr(), mf->get_partition_addr()); return mf->get_partition_addr(); } @@ -73,8 +71,6 @@ unsigned int mee::get_BMT_Layer(new_addr_type addr) { return i; } } - // if (addr == BMT_ROOT_mf) - // return 5; return 5; } @@ -86,7 +82,6 @@ new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partit new_addr_type new_addr = partition_addr >> 8 << (8 + 6); new_addr |= partition_addr & ((1 << 8) - 1); new_addr |= sub_partition_id << 8; - // printf("%x %x %x\n", new_addr, sub_partition_id, partition_addr); return new_addr; } @@ -125,8 +120,6 @@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); BMT_addr |= 0xf2000000; - // printf("%llx %llx\n", mf->get_addr(), BMT_addr); - meta_access(m_BMT_queue, BMT_addr, type, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT); @@ -150,7 +143,6 @@ void mee::meta_access( mem_fetch *mf = new mem_fetch( acc, NULL /*we don't have an instruction yet*/, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, sid, tpc, m_config, cycle, original_mf); - // mf->set_chip(original_mf->get_sub_partition_id) std::vector reqs; if (m_config->m_META_config.m_cache_type == SECTOR) @@ -188,20 +180,16 @@ void mee::CT_cycle() { m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; // assert(m_MAC_table[(new_addr_type)mf_return]); + // if (m_unit->get_mpid() == 0) + // printf("HASH :%d\n", mf_return->get_id()); m_HASH_queue->push(new hash(MAC, mf_return->get_id())); //从DRAM中取到密文,对密文进行MAC Hash - // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check m_Ciphertext_RET_queue->pop(); } - // } else { - // m_Ciphertext_RET_queue->pop(); - // } } if (!m_Ciphertext_queue->empty() && CT_counter < OTP_counter) { mem_fetch *mf = m_Ciphertext_queue->top(); // print_addr("L2 to mee:\t", mf); - if (mf->get_sub_partition_id() == 58) - var = mf->get_addr(); if (mf->is_write()) { // write // assert(!mf->is_write()); if (mf->is_raw() && !m_AES_queue->full()) { @@ -245,18 +233,12 @@ void mee::AES_cycle() { // assert(!mf->is_write()); // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); if (!m_unit->mee_dram_queue_full() && !m_HASH_queue->full()) { - // assert(!mf->is_write()); m_OTP_set[OTP_id]--; - // m_OTP_table[REQ_addr] = 0; m_unit->mee_dram_queue_push(mf); //加密完后更新DRAM中的密文 CT_counter++; - // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; - // assert(m_MAC_table[(new_addr_type)mf]); m_HASH_queue->push(new hash(MAC, mf->get_id())); //加密完后得到密文,对密文进行MAC Hash m_AES_queue->pop(); - // m_unit->L2_mee_queue_pop(spid); m_Ciphertext_queue->pop(); //写密文发往DRAM - // printf("NNNNNNNNNNNNNNNNNNNNN\n"); } } else if (!m_unit->mee_L2_queue_full(spid)) { //解密 m_OTP_set[OTP_id]--; @@ -289,17 +271,20 @@ void mee::MAC_CHECK_cycle() { if (!m_MAC_CHECK_queue->empty()) { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_MAC_CHECK_queue->top(); + print_addr("waiting for MAC Check:\t", mf); new_addr_type REQ_addr = (new_addr_type) mf->get_original_mf(); //MAC Cache的值 unsigned HASH_id = mf->get_id(); //MAC Hash值 // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); assert(HASH_id); if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 - // m_MAC_set[HASH_id]--; + // printf("MAC check: id %d sid %d\n", HASH_id, mf->get_sub_partition_id()); + m_MAC_set[HASH_id]--; // m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } else { + // print_addr("waiting for MAC Check:\t", mf); // if (mf->get_sub_partition_id() == 32) // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } @@ -309,10 +294,12 @@ void mee::MAC_CHECK_cycle() { // printf("BBBBBBBBBBBBBBB\n"); hash *mf = m_HASH_queue->top(); if (mf) { + // if (m_unit->get_mpid() == 0) + // printf("type:%d HASH :%d\n", mf->first, mf->get_id()); if (mf->first == MAC) - m_MAC_set[mf->first]++; //MAC Hash计算完成 + m_MAC_set[mf->second]++; //MAC Hash计算完成 if (mf->first == BMT) - m_BMT_set[mf->first]++; //BMT Hash计算完成 + m_BMT_set[mf->second]++; //BMT Hash计算完成 m_HASH_queue->pop(); } // delete mf; @@ -398,22 +385,14 @@ void mee::CTR_cycle() { if (!m_CTR_RET_queue->empty()) { mem_fetch *mf_return = m_CTR_RET_queue->top(); if (mf_return->get_access_type() == META_RBW) { //更新CTR前的CTR读MISS返回 - // if (!m_CTR_queue->full()) { m_CTR_RET_queue->pop(); - // gen_CTR_mf(mf_return->get_original_mf(), META_ACC, true); //更新CTR,生成写CTR的请求 - // } - // else { - // 
assert(!m_CTR_RET_queue->full()); - // } // delete mf_return;//删除1 } else { //CTR读MISS返回,CTR写一定命中 - // assert(!mf_return->is_write()); + assert(!mf_return->is_write()); // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 - // if (mf_return->get_sub_partition_id() == 60) - // printf("%p OTP %d MISS\n", mf_return->get_original_mf(), mf_return->get_id()); - m_CTR_BMT_Buffer->push(mf_return); + // m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } } @@ -431,33 +410,17 @@ void mee::CTR_cycle() { if (mf->is_write()) { if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR return; - } else { - // printf("MMMMMMMMMMMMMM\n"); } } - if (mf->get_access_type() != META_RBW) { - // m_OTP_table[(new_addr_type)mf->get_original_mf()] = ++OTP_counter; //生成<加密/解密, OTP>任务 - // if (mf->get_sub_partition_id() == 24) - // printf("ins <%p, %u>\n", mf->get_original_mf(), OTP_counter); - } - std::list events; enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); - // if (mf->get_sub_partition_id() == 24 || mf->get_sub_partition_id() == 25) - // printf("%d ", status); - // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { - // if (!m_OTP_queue->full()) { - // print_addr("HIT OTP:\t\t", mf); - - // if (mf->get_sub_partition_id() == 60) - // printf("%p OTP %d HIT\n", mf->get_original_mf(), mf->get_id()); m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 - m_CTR_BMT_Buffer->push(mf); + // m_CTR_BMT_Buffer->push(mf); } if (mf->get_access_type() != META_RBW) { m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 @@ -471,22 +434,10 @@ void mee::CTR_cycle() { if (mf->get_access_type() != META_RBW) OTP_counter++; } else { - // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); - // if (get_sub_partition_id(mf) == 0) - // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); assert(!write_sent); assert(!read_sent); - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXX"); } - } else { - // if (!m_CTR_queue->empty()) - // printf("GGGGGGGGGGGGGGGGGGGGGGGG"); } - // else if (mf->get_sub_partition_id() == 1) { - // if (m_unit->mee_dram_queue_full()) printf("AAAAAAAAAAAAAA\n"); - // if (m_OTP_queue->full()) printf("BBBBBBBBBBBBBBBBB\n"); - // if (!m_OTP_queue->empty() && m_CTR_queue->empty()) printf("CCCCCCCCCCCCCCCCCCCCCCCC\n"); - //} }; void mee::MAC_cycle() { @@ -517,8 +468,6 @@ void mee::MAC_cycle() { if (!m_MAC_set[mf->get_id()]) { return; } - } else { //对于读MAC请求,生成的MAC Check任务 - // m_MAC_table[(new_addr_type)mf->get_original_mf()] = ++MAC_counter; } std::list events; @@ -527,8 +476,6 @@ void mee::MAC_cycle() { bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { - // if (!m_OTP_queue->full()) { - // print_addr("HIT OTP:\t\t", mf); if (mf->is_write()) { //MAC写HIT,则MAC Hash值使用结束 // m_MAC_set[mf->get_id()]--; } else { @@ -539,7 +486,7 @@ void mee::MAC_cycle() { // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill - // print_addr("CTR cycle access:\t\t", mf); + // print_addr("MAC cycle access MISS:\t\t", mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 // m_MAC_set[mf->get_id()]--; } @@ -620,7 +567,7 @@ void 
mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipelinepush(mf); // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) - // print_addr("fill responses:", mf); + print_addr("fill responses:", mf); // reply(m_METAcache, mf); // delete mf; } else { @@ -639,8 +586,8 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M // assert(mf->get_access_type() != META_WR_ALLOC_R); m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - if (m_data_type == MAC) - print_addr("MAC fill:\t", mf); + // if (m_data_type == MAC) + // print_addr("MAC fill:\t", mf); assert(!mf->is_write()); // if (m_METAcache == m_BMTcache) // print_addr("fill:\t\t\t\t", mf); @@ -737,6 +684,7 @@ void mee::simple_cycle(unsigned cycle) { // L2 to mee if (!m_unit->L2_mee_queue_empty(cycle&1)) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + // print_addr("waiting for access:\t", mf); // if (mf->get_access_type() == 9) // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); From d241f04355c6549115397ed1f00b0d45d460b4b7 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 31 Aug 2024 02:13:04 +0800 Subject: [PATCH 111/133] mee v1.2.2 --- src/gpgpu-sim/mee.cc | 55 ++++++++++++++++++++++----------------- src/gpgpu-sim/mem_fetch.h | 9 +++++-- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 9ab9de24b..9f0694d38 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -120,9 +120,11 @@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); BMT_addr |= 0xf2000000; + enum data_type BMT_type = static_cast(mf->get_data_type() + 1); + meta_access(m_BMT_queue, BMT_addr, type, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT_type); } void mee::meta_access( @@ -298,7 +300,7 @@ void mee::MAC_CHECK_cycle() { // printf("type:%d HASH :%d\n", mf->first, mf->get_id()); if (mf->first == MAC) m_MAC_set[mf->second]++; //MAC Hash计算完成 - if (mf->first == BMT) + if (mf->first >= BMT) m_BMT_set[mf->second]++; //BMT Hash计算完成 m_HASH_queue->pop(); } @@ -327,26 +329,24 @@ void mee::BMT_CHECK_cycle() { // printf("BBBBBB"); //计算下一层BMT - if (get_BMT_Layer(mf->get_addr()) == 4) { + if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; cnt--; - } else if (get_BMT_Layer(mf->get_addr()) == 3) { - // printf("AAAAAAAAAAAA\n"); - assert(!m_BMT_CHECK_queue->full()); - m_BMT_CHECK_queue->push(mf); - assert(!m_HASH_queue->full()); - m_HASH_queue->push(new hash(BMT, HASH_id)); - } else { - // printf("XXXXXXXXXXXXX\n"); - assert(get_BMT_Layer(mf->get_addr()) && get_BMT_Layer(mf->get_addr())<4); - assert(!m_BMT_queue->full(2)); + } + // else if (mf->get_data_type() == BMT_L4) { + // // printf("AAAAAAAAAAAA\n"); + // assert(!m_BMT_CHECK_queue->full()); + // assert(!m_HASH_queue->full()); + // mf->set_data_type(BMT_ROOT); + // m_BMT_CHECK_queue->push(mf);//ROOT mf + // m_HASH_queue->push(new hash(BMT, HASH_id)); + // } + else { + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); if (mf->is_write()) - gen_BMT_mf(mf->get_original_mf(), false, META_RBW, 128, 0); - assert(!m_BMT_queue->full()); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 
8, HASH_id); + gen_BMT_mf(mf->get_original_mf(), true, META_RBW, 8, 0); assert(!m_HASH_queue->full()); - m_HASH_queue->push(new hash(BMT, HASH_id)); } // if (REQ_addr == (new_addr_type) BMT_ROOT_mf) { @@ -373,7 +373,8 @@ void mee::BMT_CHECK_cycle() { assert(cnt==0); // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); + // gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); + m_BMT_CHECK_queue->push(mf); m_HASH_queue->push(new hash(BMT, mf->get_id())); m_CTR_BMT_Buffer->pop(); BMT_busy = true; @@ -392,7 +393,7 @@ void mee::CTR_cycle() { // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 - // m_CTR_BMT_Buffer->push(mf_return); + m_CTR_BMT_Buffer->push(mf_return); m_CTR_RET_queue->pop(); } } @@ -420,7 +421,7 @@ void mee::CTR_cycle() { if (status == HIT) { m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 - // m_CTR_BMT_Buffer->push(mf); + m_CTR_BMT_Buffer->push(mf); } if (mf->get_access_type() != META_RBW) { m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 @@ -508,8 +509,9 @@ void mee::BMT_cycle() { mem_fetch *mf_return = m_BMT_RET_queue->top(); // print_addr("MISS OTP:\t\t", mf_return); if (mf_return->get_access_type() != META_RBW) { - if (!m_BMT_CHECK_queue->full()) { + if (!m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { m_BMT_CHECK_queue->push(mf_return); + m_HASH_queue->push(new hash(BMT, mf_return->get_id())); m_BMT_RET_queue->pop(); } } else { @@ -519,7 +521,7 @@ void mee::BMT_cycle() { m_BMTcache->cycle(); - bool output_full = m_BMT_CHECK_queue->full() || m_BMT_RET_queue->full(); + bool output_full = m_BMT_CHECK_queue->full() || m_BMT_RET_queue->full() || m_HASH_queue->full(); bool port_free = m_unit->m_BMTcache->data_port_free(); if (!m_BMT_queue->empty()) { @@ -548,8 +550,10 @@ void mee::BMT_cycle() { assert(status == HIT); } if (status == HIT) { - if (mf->get_access_type() != META_RBW) + if (mf->get_access_type() != META_RBW) { m_BMT_CHECK_queue->push(mf); + m_HASH_queue->push(new hash(BMT, mf->get_id())); + } m_BMT_queue->pop(); } else if (status != RESERVATION_FAIL) { m_BMT_queue->pop(); @@ -654,7 +658,10 @@ void mee::simple_cycle(unsigned cycle) { META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base, CTR); META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base, MAC); // for (int layer = 1; layer <= 4; layer++) { - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT); + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L1); + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L2); + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L3); + META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L4); // } } else { // 密文访存返回 // assert(mf_return->get_access_type() != 4); diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index a3d12041a..c47cc7430 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -35,9 +35,14 @@ enum data_type { DEFAULT = 0, - CTR, MAC, - BMT + BMT, + CTR, + BMT_L1, + BMT_L2, + BMT_L3, + BMT_L4, + BMT_ROOT }; enum mf_type { From ba10d4b4fa9ef1a356bf6184361bd249b9dfc561 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 31 Aug 2024 21:10:09 +0800 Subject: [PATCH 112/133] mee 
v1.2.5 --- .../base_mee/gpgpusim.config_base_mee | 2 +- src/gpgpu-sim/mee.cc | 65 ++++++++----------- 2 files changed, 27 insertions(+), 40 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index cdac68eea..ece992cdb 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -150,7 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 #-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,160:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 9f0694d38..e192ed70a 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,5 +1,7 @@ #include "mee.h" #include +#define BMT_Enable +#define MAC_Enable mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : m_unit(unit), @@ -36,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 0) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); + printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); } } @@ -112,9 +114,9 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr, unsigned mf_id) { void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); - unsigned int Layer = get_BMT_Layer(mf->get_addr()); - if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 - return; + // unsigned int Layer = get_BMT_Layer(mf->get_addr()); + // if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 + // return; partition_addr = partition_addr & 0x003fffff; partition_addr = partition_addr >> 7 << 3; new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); @@ -316,43 +318,25 @@ void mee::BMT_CHECK_cycle() { mem_fetch *mf = m_BMT_CHECK_queue->top(); new_addr_type REQ_addr = (new_addr_type) mf; //BMT Cache的值 unsigned HASH_id = mf->get_id(); //BMT Hash值 + assert(mf->get_access_type() != META_RBW); // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); // assert(mf); - if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2) && !m_HASH_queue->full()) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2)) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); - // BMT_busy = false; - // print_addr("BMT Check finish:", mf); - // if (mf->get_sub_partition_id() == 1) - // printf("%d %d\n", m_BMT_table.size(), m_BMT_table.empty()); - - // printf("BBBBBB"); //计算下一层BMT if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; - cnt--; - } - // else if (mf->get_data_type() == BMT_L4) { - // // printf("AAAAAAAAAAAA\n"); - // 
assert(!m_BMT_CHECK_queue->full()); - // assert(!m_HASH_queue->full()); - // mf->set_data_type(BMT_ROOT); - // m_BMT_CHECK_queue->push(mf);//ROOT mf - // m_HASH_queue->push(new hash(BMT, HASH_id)); - // } - else { - gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); - if (mf->is_write()) - gen_BMT_mf(mf->get_original_mf(), true, META_RBW, 8, 0); - assert(!m_HASH_queue->full()); + } else { + if (mf->is_write()) { + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); + assert(!m_BMT_queue->full()); + gen_BMT_mf(mf, false, META_RBW, 128, 0); + } else + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); } - - // if (REQ_addr == (new_addr_type) BMT_ROOT_mf) { - // printf("AAAAAAAAAAAA\n"); - // BMT_busy = false; - // } } } @@ -369,8 +353,7 @@ void mee::BMT_CHECK_cycle() { // } // CTR to BMT - if (!m_CTR_BMT_Buffer->empty() && !m_BMT_queue->full() && !m_HASH_queue->full() && !BMT_busy && m_BMT_table.empty()) { - assert(cnt==0); + if (!m_CTR_BMT_Buffer->empty() && !m_BMT_CHECK_queue->full() && !m_HASH_queue->full() && !BMT_busy) { // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); // gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); @@ -378,7 +361,6 @@ void mee::BMT_CHECK_cycle() { m_HASH_queue->push(new hash(BMT, mf->get_id())); m_CTR_BMT_Buffer->pop(); BMT_busy = true; - cnt++; } } @@ -393,7 +375,9 @@ void mee::CTR_cycle() { // print_addr("MISS OTP:\t\t", mf_return); if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 + #ifdef BMT_Enable m_CTR_BMT_Buffer->push(mf_return); + #endif m_CTR_RET_queue->pop(); } } @@ -421,7 +405,9 @@ void mee::CTR_cycle() { if (status == HIT) { m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 + #ifdef BMT_Enable m_CTR_BMT_Buffer->push(mf); + #endif } if (mf->get_access_type() != META_RBW) { m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 @@ -534,9 +520,9 @@ void mee::BMT_cycle() { // print_addr("MAC cycle access:\t\t", mf); // assert(mf->get_access_type() == mf->get_access_type()); - if (mf->is_write()) { + if (mf->get_access_type() == META_RBW) { //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT - if (!m_BMT_set[mf->get_id()]) { + if (m_BMTcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR return; } } @@ -546,9 +532,6 @@ void mee::BMT_cycle() { bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); - if (mf->get_access_type() == META_RBW) { - assert(status == HIT); - } if (status == HIT) { if (mf->get_access_type() != META_RBW) { m_BMT_CHECK_queue->push(mf); @@ -710,7 +693,9 @@ void mee::simple_cycle(unsigned cycle) { mf->set_id(mf_counter); gen_CTR_mf(mf, META_RBW, false, 0); gen_CTR_mf(mf, META_ACC, true, mf_counter); + #ifdef MAC_Enable gen_MAC_mf(mf, true, mf_counter); + #endif // m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 m_Ciphertext_queue->push(mf); m_unit->L2_mee_queue_pop(cycle&1); @@ -724,7 +709,9 @@ void mee::simple_cycle(unsigned cycle) { mf->set_id(mf_counter); m_Ciphertext_queue->push(mf); gen_CTR_mf(mf, META_ACC, false, mf_counter); + #ifdef MAC_Enable gen_MAC_mf(mf, false, mf_counter); + #endif m_unit->L2_mee_queue_pop(cycle&1); } } else { From 662e4a77835422d9943722f70d06590dac013f1a Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 31 Aug 2024 21:22:10 +0800 Subject: [PATCH 113/133] mee v1.2.5 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index e192ed70a..fe0cdca9d 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -38,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 0) { - printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); } } From 7a2dc01a91bde51d9d6a7fad4f6b165197bd1284 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Tue, 3 Sep 2024 23:41:07 +0800 Subject: [PATCH 114/133] mee v1.2.6 --- src/gpgpu-sim/dram.cc | 3 ++- src/gpgpu-sim/mee.cc | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index 545c45dfd..6360c6726 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -264,7 +264,8 @@ void dram_t::push(class mem_fetch *data) { max_mrqs_temp = (max_mrqs_temp > mrqq->get_length()) ? max_mrqs_temp : mrqq->get_length(); } - m_stats->memlatstat_dram_access(data); + if (data->get_sid() < 80) + m_stats->memlatstat_dram_access(data); } void dram_t::scheduler_fifo() { diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index fe0cdca9d..a20108584 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -10,7 +10,7 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - unsigned len = 64; + unsigned len = 8; m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, len); m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); @@ -19,7 +19,7 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, len); m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, len); m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, len); - m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, len + 100); m_OTP_queue = new fifo_pipeline("meta-queue", 40, 40 + len); m_AES_queue = new fifo_pipeline("meta-queue", 0, len); @@ -38,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 0) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id()); // print_tag(); + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } } @@ -157,8 +157,8 @@ void mee::meta_access( for (unsigned i = 0; i < reqs.size(); ++i) { assert(reqs.size() == 1); mem_fetch *req = reqs[i]; - reqs[i]->set_id(mf_id); - reqs[i]->set_data_type(m_data_type); + req->set_id(mf_id); + 
req->set_data_type(m_data_type); assert(!m_META_queue->full()); m_META_queue->push(req); } @@ -229,6 +229,7 @@ void mee::AES_cycle() { int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); + print_addr("waiting for AES:\t", mf); assert(OTP_id); // if (mf->is_write()) // printf("PPPPPPPPPPPPPP\n"); @@ -520,12 +521,12 @@ void mee::BMT_cycle() { // print_addr("MAC cycle access:\t\t", mf); // assert(mf->get_access_type() == mf->get_access_type()); - if (mf->get_access_type() == META_RBW) { - //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT - if (m_BMTcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR - return; - } - } + // if (mf->get_access_type() == META_RBW) { + // //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT + // if (m_BMTcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR + // return; + // } + // } std::list events; enum cache_request_status status = m_BMTcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); From 1c168bb64c9e4f1cad2772cca9b365527e82f706 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 4 Sep 2024 15:41:03 +0800 Subject: [PATCH 115/133] mee v1.2.6 fix:fill deadlock --- .../base_mee/gpgpusim.config_base_mee | 2 +- src/gpgpu-sim/l2cache.cc | 3 +++ src/gpgpu-sim/mee.cc | 18 +++++++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index ece992cdb..16c3b2033 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -164,7 +164,7 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,320:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 -gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index e93103c4e..d1f3c4cd9 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -740,6 +740,9 @@ bool memory_partition_unit::mee_dram_queue_full(int size) const { } void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf) { + if (get_mpid() == 0) { + printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %lld\n", "mee to dram push:\t", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + } m_mee_dram_queue->push(mf); //TODO } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index a20108584..07414416a 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,7 +1,7 @@ #include "mee.h" #include -#define BMT_Enable -#define MAC_Enable +// #define BMT_Enable +// #define MAC_Enable mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : m_unit(unit), @@ -229,7 +229,7 @@ void mee::AES_cycle() { int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); - print_addr("waiting for AES:\t", mf); + // print_addr("waiting for AES:\t", mf); assert(OTP_id); // if (mf->is_write()) // printf("PPPPPPPPPPPPPP\n"); @@ -257,6 +257,7 @@ void mee::AES_cycle() { // printf("IIIIIIIIIIIIIIII\n"); } } else { + print_addr("waiting for AES:\t", mf); // if (mf->is_write()) // printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); } @@ -276,7 +277,7 @@ void mee::MAC_CHECK_cycle() { if (!m_MAC_CHECK_queue->empty()) { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_MAC_CHECK_queue->top(); - print_addr("waiting for MAC Check:\t", mf); + // print_addr("waiting for MAC Check:\t", mf); new_addr_type REQ_addr = (new_addr_type) mf->get_original_mf(); //MAC Cache的值 unsigned HASH_id = mf->get_id(); //MAC Hash值 // if (mf->get_sub_partition_id() == 0) @@ -385,6 +386,7 @@ void mee::CTR_cycle() { } m_CTRcache->cycle(); + CT_cycle(); bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full(); bool port_free = m_unit->m_CTRcache->data_port_free(); @@ -426,6 +428,8 @@ void mee::CTR_cycle() { assert(!read_sent); } } + + // m_CTRcache->cycle(); }; void mee::MAC_cycle() { @@ -555,7 +559,7 @@ void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipelinepush(mf); // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) - print_addr("fill responses:", mf); + print_addr("fill responses:\t", mf); // reply(m_METAcache, mf); // delete mf; } else { @@ -586,7 +590,7 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M // } m_unit->dram_mee_queue_pop(); } - } else if ((mf->get_data_type() == m_data_type) && !m_META_RET_queue->full()) { + } else if (mf->get_data_type() == m_data_type) { if (mf->is_write() && mf->get_type() == WRITE_ACK) 
mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); @@ -741,7 +745,7 @@ void mee::simple_cycle(unsigned cycle) { BMT_cycle(); AES_cycle(); CTR_cycle(); - CT_cycle(); + // CT_cycle(); } void mee::cycle(unsigned cycle) { From fb3eed9d04937f15c29dcedb847851cd12fa533b Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Thu, 5 Sep 2024 03:21:24 +0800 Subject: [PATCH 116/133] mee v1.2.7 --- src/gpgpu-sim/l2cache.cc | 2 +- src/gpgpu-sim/mee.cc | 14 ++++++++++---- src/gpgpu-sim/mee.h | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index d1f3c4cd9..adc47e0f8 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -741,7 +741,7 @@ bool memory_partition_unit::mee_dram_queue_full(int size) const { void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf) { if (get_mpid() == 0) { - printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %lld\n", "mee to dram push:\t", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %lld\n", "mee to dram push:\t", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } m_mee_dram_queue->push(mf); //TODO } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 07414416a..566c1cb99 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,7 +1,7 @@ #include "mee.h" #include -// #define BMT_Enable -// #define MAC_Enable +#define BMT_Enable +#define MAC_Enable mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : m_unit(unit), @@ -331,6 +331,7 @@ void mee::BMT_CHECK_cycle() { if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; + BMT_counter++; } else { if (mf->is_write()) { gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); @@ -391,7 +392,7 @@ void mee::CTR_cycle() { bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full(); bool port_free = m_unit->m_CTRcache->data_port_free(); - if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { + if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free && CTR_counter <= BMT_counter) { mem_fetch *mf = m_CTR_queue->top(); // print_addr("CTR cycle access:\t\t", mf); @@ -410,6 +411,7 @@ void mee::CTR_cycle() { if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 #ifdef BMT_Enable m_CTR_BMT_Buffer->push(mf); + CTR_counter++; #endif } if (mf->get_access_type() != META_RBW) { @@ -421,8 +423,12 @@ void mee::CTR_cycle() { // set wating for CTR fill // print_addr("CTR cycle access:\t\t", mf); m_CTR_queue->pop(); - if (mf->get_access_type() != META_RBW) + if (mf->get_access_type() != META_RBW) { OTP_counter++; + #ifdef BMT_Enable + CTR_counter++; + #endif + } } else { assert(!write_sent); assert(!read_sent); diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index e23aa8656..52b152e10 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ 
-119,6 +119,7 @@ class mee { unsigned CT_counter = 0; unsigned OTP_counter = 0; unsigned MAC_counter = 0; + unsigned CTR_counter = 0; unsigned BMT_counter = 0; int var; From d8b8007de0350e3a769b10aaac7d87877391250c Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Fri, 6 Sep 2024 01:11:39 +0800 Subject: [PATCH 117/133] mee v1.2.8 --- src/gpgpu-sim/mee.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 566c1cb99..a9668af60 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -327,6 +327,7 @@ void mee::BMT_CHECK_cycle() { if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2)) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); + print_addr("BMT Hash:\t", mf); //计算下一层BMT if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); @@ -334,9 +335,9 @@ void mee::BMT_CHECK_cycle() { BMT_counter++; } else { if (mf->is_write()) { - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram assert(!m_BMT_queue->full()); - gen_BMT_mf(mf, false, META_RBW, 128, 0); + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); } else gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); } @@ -410,19 +411,21 @@ void mee::CTR_cycle() { m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 #ifdef BMT_Enable + print_addr("CTR Write:\t", mf); m_CTR_BMT_Buffer->push(mf); CTR_counter++; #endif } - if (mf->get_access_type() != META_RBW) { + else if (mf->get_access_type() != META_RBW) { m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 OTP_counter++; } // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill - // print_addr("CTR cycle access:\t\t", mf); + print_addr("CTR MISS:\t", mf); m_CTR_queue->pop(); + assert(!mf->is_write()); if (mf->get_access_type() != META_RBW) { OTP_counter++; #ifdef BMT_Enable @@ -505,7 +508,7 @@ void mee::BMT_cycle() { if (!m_BMT_RET_queue->empty()) { mem_fetch *mf_return = m_BMT_RET_queue->top(); // print_addr("MISS OTP:\t\t", mf_return); - if (mf_return->get_access_type() != META_RBW) { + if (!mf_return->is_write()) { if (!m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { m_BMT_CHECK_queue->push(mf_return); m_HASH_queue->push(new hash(BMT, mf_return->get_id())); @@ -528,7 +531,7 @@ void mee::BMT_cycle() { if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { mem_fetch *mf = m_BMT_queue->top(); - // print_addr("MAC cycle access:\t\t", mf); + print_addr("BMT waiting access:\t", mf); // assert(mf->get_access_type() == mf->get_access_type()); // if (mf->get_access_type() == META_RBW) { @@ -544,14 +547,17 @@ void mee::BMT_cycle() { bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { - if (mf->get_access_type() != META_RBW) { + print_addr("BMT access HIT:\t", mf); + if (!mf->is_write()) { m_BMT_CHECK_queue->push(mf); m_HASH_queue->push(new hash(BMT, mf->get_id())); } m_BMT_queue->pop(); } else if (status != RESERVATION_FAIL) { + print_addr("BMT access MISS:\t", mf); m_BMT_queue->pop(); } else { + print_addr("BMT access reservation_fail:\t", mf); assert(!write_sent); assert(!read_sent); } @@ -702,7 +708,7 @@ void mee::simple_cycle(unsigned cycle) { // if (!m_Ciphertext_queue->full()) { mf_counter++; mf->set_id(mf_counter); - gen_CTR_mf(mf, META_RBW, false, 0); + gen_CTR_mf(mf, META_ACC, false, mf_counter);//Lazy_ftech_on_read 
gen_CTR_mf(mf, META_ACC, true, mf_counter); #ifdef MAC_Enable gen_MAC_mf(mf, true, mf_counter); From a2051f92e914795e2d5aade9a3c5131446533d0e Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 11 Sep 2024 03:39:05 +0800 Subject: [PATCH 118/133] mee v1.2.9 --- .../base_mee/gpgpusim.config_base_mee | 16 +- .../base_mee_normal/accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../base_mee_normal/accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../base_mee_normal/accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../base_mee_normal/config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_normal | 250 +++++++ .../base_mee_sector/accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../base_mee_sector/accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../base_mee_sector/accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../base_mee_sector/config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ src/gpgpu-sim/gpu-cache.cc | 3 + src/gpgpu-sim/l2cache.cc | 14 +- src/gpgpu-sim/mee.cc | 121 +++- src/gpgpu-sim/mee.h | 24 +- 21 files changed, 8161 insertions(+), 61 deletions(-) create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee index 16c3b2033..c344ca2ca 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee @@ -150,7 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 #-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,S:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 @@ 
-164,7 +164,7 @@
 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
 #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
--gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32
 -gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32
 -gpgpu_cache:dl2_texture_only 0
@@ -173,16 +173,16 @@
 -gpgpu_memory_partition_indexing 2
 # 128 KB Inst.
-#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
--gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
 -gpgpu_inst_fetch_throughput 4
 # 128 KB Tex
 # Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
-#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
--gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
 # 64 KB Const
-#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
--gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
 -gpgpu_perfect_inst_const_cache 1
 # interconnection
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch power-model XML omitted: the markup was lost in extraction and only bare "+" markers survived]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch power-model XML omitted; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML omitted; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML omitted; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML omitted; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch power-model XML omitted; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal new file mode 100644 index 000000000..16c3b2033 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHZ
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHZ, V100 HBM runs at 850 MHZ
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+ CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has dual bus interface, in which it can issue two col and row commands at a time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML; markup not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector
new file mode 100644
index 000000000..1c68a53c0
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each 
scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:8:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 6e5b8eda7..bcc7f989c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -600,6 +600,7 @@ mem_fetch *mshr_table::next_access() { new_addr_type block_addr = m_current_response.front(); assert(!m_data[block_addr].m_list.empty()); mem_fetch *result = m_data[block_addr].m_list.front(); + // printf("cache fill response: data size: %d\taccess size:%d\n", result->get_data_size(), result->get_access_size()); m_data[block_addr].m_list.pop_front(); if (m_data[block_addr].m_list.empty()) { // release entry @@ -1054,6 +1055,7 @@ bool baseline_cache::bandwidth_management::fill_port_free() const { void baseline_cache::cycle() { if (!m_miss_queue.empty()) { mem_fetch *mf = m_miss_queue.front(); + // printf("%s cache cycle: data size: %d\taccess size:%d\n", m_name.c_str(), mf->get_data_size(), mf->get_access_size()); if (!m_memport->full(mf->size(), mf->get_is_write())) { m_miss_queue.pop_front(); m_memport->push(mf); @@ -1068,6 +1070,7 @@ void baseline_cache::cycle() { /// Interface for response from lower memory level (model bandwidth restictions /// in caller) void baseline_cache::fill(mem_fetch *mf, unsigned time) { + // printf("%s cache fill: data size: 
%d\taccess size:%d\taccess type:%d\n", m_name.c_str(), mf->get_data_size(), mf->get_access_size(), mf->get_access_type()); if (m_config.m_mshr_type == SECTOR_ASSOC) { assert(mf->get_original_mf()); extra_mf_fields_lookup::iterator e = diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index adc47e0f8..7f374d125 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -82,20 +82,24 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_gpu(gpu) { m_dram = new dram_t(m_id, m_config, m_stats, this, gpu); - char L2c_name[32]; - snprintf(L2c_name, 32, "L2_bank_%03d", m_id); + char CTRc_name[32]; + char MACc_name[32]; + char BMTc_name[32]; + snprintf(CTRc_name, 32, "CTR_bank_%03d\0", m_id); + snprintf(MACc_name, 32, "MAC_bank_%03d\0", m_id); + snprintf(BMTc_name, 32, "BMT_bank_%03d\0", m_id); m_metainterface = new metainterface(this); m_mf_allocator = new partition_mf_allocator(config); if (!m_config->m_META_config.disabled()) { m_CTRcache = - new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(CTRc_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_MACcache = - new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(MACc_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_BMTcache = - new meta_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(BMTc_name, m_config->m_META_config, -1, -1, m_metainterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index a9668af60..3aaefbaee 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -38,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 0) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } } @@ -87,44 +87,61 @@ new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partit return new_addr; } -void mee::gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr, unsigned mf_id) { +void mee::gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); - partition_addr = partition_addr >> 14 << 7; + // new_addr_type minor_addr = (partition_addr >> 7) & 127; + // minor_addr = 128 + minor_addr * 7; + // bool res = minor_addr & 7 > 1; + // minor_addr >>= 3; + partition_addr = (partition_addr >> 14 << 7); + + // if (meta_acc == META_ACC) + // partition_addr |= minor_addr; + new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); CTR_addr |= CTR_base; + // if (meta_acc == META_ACC && res) + // size <<= 1; + meta_access(m_CTR_queue, CTR_addr, meta_acc, - 128, wr, m_gpu->gpu_tot_sim_cycle + 
m_gpu->gpu_sim_cycle, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, CTR); } -void mee::gen_MAC_mf(mem_fetch *mf, bool wr, unsigned mf_id) { +void mee::gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); - partition_addr = partition_addr >> 7 << 3; + if (m_config->m_META_config.m_cache_type == SECTOR) + partition_addr = partition_addr >> 5 << 1; + else + partition_addr = partition_addr >> 7 << 3; new_addr_type MAC_addr = get_addr(sub_partition_id, partition_addr); MAC_addr |= MAC_base; - meta_access(m_MAC_queue, MAC_addr, META_ACC, - 8, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + meta_access(m_MAC_queue, MAC_addr, meta_acc, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, MAC); } -void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id) { +void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); // unsigned int Layer = get_BMT_Layer(mf->get_addr()); // if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 // return; - partition_addr = partition_addr & 0x003fffff; - partition_addr = partition_addr >> 7 << 3; + partition_addr = partition_addr & 0x007fffff; + if (size == 128) + partition_addr = partition_addr >> 11 << 7; + else + partition_addr = partition_addr >> 7 << 3; new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); - BMT_addr |= 0xf2000000; + BMT_addr |= 0xE4000000; enum data_type BMT_type = static_cast(mf->get_data_type() + 1); - meta_access(m_BMT_queue, BMT_addr, type, + meta_access(m_BMT_queue, BMT_addr, meta_acc, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT_type); } @@ -136,12 +153,14 @@ void mee::meta_access( mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; - for (unsigned i = 0; i < size; i++) byte_mask.set(i); + for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); if (size == 128) for (unsigned i = 0; i < size / 32; i++) sector_mask.set(i); else - sector_mask.set((addr >> 5) & 3); + for (unsigned i = (addr >> 5) & 3; i < ((addr >> 5) & 3) + ((size + 31) / 32); i++) + sector_mask.set(i); + // sector_mask.set((addr >> 5) & 3); mem_access_t acc(type, addr, size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx); mem_fetch *mf = new mem_fetch( @@ -155,10 +174,14 @@ void mee::meta_access( reqs.push_back(mf); for (unsigned i = 0; i < reqs.size(); ++i) { - assert(reqs.size() == 1); + // assert(reqs.size() == 1); mem_fetch *req = reqs[i]; - req->set_id(mf_id); + // req->set_id(mf_id); req->set_data_type(m_data_type); + if (i == reqs.size() - 1) + req->set_id(mf_id); + else + req->set_id(0); assert(!m_META_queue->full()); m_META_queue->push(req); } @@ -324,7 +347,7 @@ void mee::BMT_CHECK_cycle() { // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); // assert(mf); - if (m_BMT_set[HASH_id] && !m_BMT_queue->full(2)) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(5)) || (m_config->m_META_config.m_cache_type != SECTOR && 
!m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); print_addr("BMT Hash:\t", mf); @@ -332,7 +355,8 @@ void mee::BMT_CHECK_cycle() { if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; - BMT_counter++; + if (mf->get_id()) + BMT_counter++; } else { if (mf->is_write()) { gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram @@ -371,7 +395,7 @@ void mee::BMT_CHECK_cycle() { void mee::CTR_cycle() { if (!m_CTR_RET_queue->empty()) { mem_fetch *mf_return = m_CTR_RET_queue->top(); - if (mf_return->get_access_type() == META_RBW) { //更新CTR前的CTR读MISS返回 + if (!mf_return->get_id() || mf_return->get_access_type() == META_RBW) { //更新CTR前的CTR读MISS返回 m_CTR_RET_queue->pop(); // delete mf_return;//删除1 } else { //CTR读MISS返回,CTR写一定命中 @@ -412,13 +436,17 @@ void mee::CTR_cycle() { if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 #ifdef BMT_Enable print_addr("CTR Write:\t", mf); - m_CTR_BMT_Buffer->push(mf); - CTR_counter++; + if (mf->get_id()) + m_CTR_BMT_Buffer->push(mf); + if (mf->get_id()) + CTR_counter++; #endif } else if (mf->get_access_type() != META_RBW) { - m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 - OTP_counter++; + if (mf->get_id()) + m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 + if (mf->get_id()) + OTP_counter++; } // } } else if (status != RESERVATION_FAIL) { @@ -427,9 +455,11 @@ void mee::CTR_cycle() { m_CTR_queue->pop(); assert(!mf->is_write()); if (mf->get_access_type() != META_RBW) { - OTP_counter++; + if (mf->get_id()) + OTP_counter++; #ifdef BMT_Enable - CTR_counter++; + if (mf->get_id()) + CTR_counter++; #endif } } else { @@ -508,7 +538,7 @@ void mee::BMT_cycle() { if (!m_BMT_RET_queue->empty()) { mem_fetch *mf_return = m_BMT_RET_queue->top(); // print_addr("MISS OTP:\t\t", mf_return); - if (!mf_return->is_write()) { + if (mf_return->get_id() && !mf_return->is_write()) { if (!m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { m_BMT_CHECK_queue->push(mf_return); m_HASH_queue->push(new hash(BMT, mf_return->get_id())); @@ -548,7 +578,7 @@ void mee::BMT_cycle() { // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { print_addr("BMT access HIT:\t", mf); - if (!mf->is_write()) { + if (mf->get_id() && !mf->is_write()) { m_BMT_CHECK_queue->push(mf); m_HASH_queue->push(new hash(BMT, mf->get_id())); } @@ -567,7 +597,7 @@ void mee::BMT_cycle() { void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { if (m_METAcache->access_ready() && !m_META_RET_queue->full()) { mem_fetch *mf = m_METAcache->next_access(); - if (mf->get_access_type() == META_ACC) + if (mf->get_access_type() == META_ACC && mf->get_id()) m_META_RET_queue->push(mf); // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) @@ -700,7 +730,8 @@ void mee::simple_cycle(unsigned cycle) { assert(mf->is_raw()); // printf("TTTTTTTTTTTTTTTT\n"); - if (!m_CTR_queue->full(2) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { + if (((m_config->m_META_config.m_cache_type == SECTOR && !m_CTR_queue->full(8)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_CTR_queue->full(2))) + && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { // assert(!mf->is_write()); if (mf->is_write()) { // write assert(mf->is_raw()); @@ -708,11 +739,21 @@ void mee::simple_cycle(unsigned cycle) { // if (!m_Ciphertext_queue->full()) { mf_counter++; mf->set_id(mf_counter); - 
gen_CTR_mf(mf, META_ACC, false, mf_counter);//Lazy_ftech_on_read - gen_CTR_mf(mf, META_ACC, true, mf_counter); + + // gen_CTR_mf(mf, false, META_RBW, 16, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, false, META_ACC, 1, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, true, META_RBW, 16, mf_counter); + // gen_CTR_mf(mf, true, META_ACC, 1, mf_counter); + gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read + gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + #ifdef MAC_Enable - gen_MAC_mf(mf, true, mf_counter); + if (m_config->m_META_config.m_cache_type == SECTOR) + gen_MAC_mf(mf, true, META_ACC, 2, mf_counter); + else + gen_MAC_mf(mf, true, META_ACC, 8, mf_counter); #endif + // m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 m_Ciphertext_queue->push(mf); m_unit->L2_mee_queue_pop(cycle&1); @@ -725,9 +766,12 @@ void mee::simple_cycle(unsigned cycle) { mf_counter++; mf->set_id(mf_counter); m_Ciphertext_queue->push(mf); - gen_CTR_mf(mf, META_ACC, false, mf_counter); + gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); #ifdef MAC_Enable - gen_MAC_mf(mf, false, mf_counter); + if (m_config->m_META_config.m_cache_type == SECTOR) + gen_MAC_mf(mf, false, META_ACC, 2, mf_counter); + else + gen_MAC_mf(mf, false, META_ACC, 8, mf_counter); #endif m_unit->L2_mee_queue_pop(cycle&1); } @@ -806,4 +850,9 @@ void mee::cycle(unsigned cycle) { //ok 增加访存类型的属性 //ok 单个HASH单元 //ok None Sector -//lazy_fetch_on_read不能和None_Sector混用,因为设置modified会Sector_MISS \ No newline at end of file +//lazy_fetch_on_read不能和None_Sector混用,因为设置modified会Sector_MISS + +//Sector +//deepbench +//可配置 +//lazy_fetch_on_read \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 52b152e10..603470e41 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -32,9 +32,9 @@ class mee { new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); unsigned int get_BMT_Layer(new_addr_type addr); - void gen_CTR_mf(mem_fetch *mf, mem_access_type meta_acc, bool wr, unsigned mf_id); - void gen_MAC_mf(mem_fetch *mf, bool wr, unsigned mf_id); - void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type type, unsigned size, unsigned mf_id); + void gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); + void gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); + void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); bool META_queue_empty(); void META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); @@ -89,16 +89,16 @@ class mee { const new_addr_type CTR_mask = 0xFE000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx const new_addr_type MAC_mask = 0xF0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 - //CTR: 1111 000x xxxx xxxx xxxx xxxx xxxx xxxx - //L1 : 1111 0010 000x xxxx xxxx xxxx xxxx x000 - //L2 : 1111 0010 0010 000x xxxx xxxx xxxx x000 - //L3 : 1111 0010 0010 0010 00xx xxxx 0xxx x000 - //L4 : 1111 0010 0010 0010 00xx xxxx 1000 0000 - //ROOT:1111 0010 0010 0010 00xx xxxx 1000 1000 - const new_addr_type BMT_base[5] = {0xF0000000, 0xF2000000, 0xF2200000, 0xF2220000, 0xF2220080}; + //CTR: 1110 00xx xxxx xxxx xxxx xxxx xxxx xxxx + //L1 : 1110 0100 00xx xxxx xxxx xxxx xxxx x000 + //L2 : 1110 0100 0100 00xx xxxx xxxx xxxx x000 + //L3 : 1110 0100 0100 0100 00xx xxxx xxxx x000 + //L4 : 1110 0100 0100 0100 01xx xxxx 0000 x000 + //ROOT:1110 0100 0100 0100 01xx xxxx 0001 0000 + const new_addr_type BMT_base[5] = 
{0xE0000000, 0xE4000000, 0xE4400000, 0xE4440000, 0xE4444000}; - const new_addr_type CTR_base = 0xF0000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx - const new_addr_type MAC_base = 0xE0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 + const new_addr_type CTR_base = 0xE0000000;//1110 00xx xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_base = 0xC0000000;//110x xxxx xxxx xxxx xxxx xxxx xxxx x000 const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; From 4c281e21b0c3f670229dab0a7dbfa84467ff0561 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Wed, 11 Sep 2024 03:41:33 +0800 Subject: [PATCH 119/133] mee v1.2.9 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 3aaefbaee..701d5902b 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -38,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 0) { - printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } } From 5f78321f42f75d263d55293f5f4b377483466dfe Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Fri, 13 Sep 2024 00:51:36 +0800 Subject: [PATCH 120/133] mee v1.2.9 --- src/gpgpu-sim/l2cache.cc | 2 +- src/gpgpu-sim/mee.cc | 97 ++++++++++++++++++++++++++++------------ src/gpgpu-sim/mee.h | 18 ++++---- 3 files changed, 78 insertions(+), 39 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 7f374d125..6b0b70d9b 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -622,7 +622,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // new L2 texture accesses and/or non-texture accesses if (!m_L2_mee_queue->full() && !m_icnt_L2_queue->empty()) { mem_fetch *mf = m_icnt_L2_queue->top(); - // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); if (!m_config->m_L2_config.disabled() && ((m_config->m_L2_texure_only && mf->istexture()) || diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 701d5902b..f55afbf9d 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,6 +1,6 @@ #include "mee.h" #include -#define BMT_Enable +// #define BMT_Enable #define MAC_Enable mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : @@ -10,7 +10,7 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - unsigned len = 8; + unsigned len = 16; m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_queue = new 
fifo_pipeline("meta-queue", 0, len); m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); @@ -37,7 +37,7 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (m_unit->get_mpid() == 0) { + if (m_unit->get_mpid() == 12) { // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } } @@ -131,13 +131,13 @@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned // unsigned int Layer = get_BMT_Layer(mf->get_addr()); // if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 // return; - partition_addr = partition_addr & 0x007fffff; + partition_addr = partition_addr & 0x003fffff; if (size == 128) partition_addr = partition_addr >> 11 << 7; else partition_addr = partition_addr >> 7 << 3; new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); - BMT_addr |= 0xE4000000; + BMT_addr |= 0xF2000000; enum data_type BMT_type = static_cast(mf->get_data_type() + 1); @@ -153,16 +153,22 @@ void mee::meta_access( mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; + unsigned data_size = 0; for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); - if (size == 128) + if (size == 128) { for (unsigned i = 0; i < size / 32; i++) sector_mask.set(i); - else + addr = addr >> 7 << 7; + data_size = 128; + } + else { for (unsigned i = (addr >> 5) & 3; i < ((addr >> 5) & 3) + ((size + 31) / 32); i++) sector_mask.set(i); + addr = addr >> 5 << 5; + data_size = 32; // sector_mask.set((addr >> 5) & 3); - - mem_access_t acc(type, addr, size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx); + } + mem_access_t acc(type, addr, data_size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx); mem_fetch *mf = new mem_fetch( acc, NULL /*we don't have an instruction yet*/, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, sid, tpc, m_config, cycle, original_mf); @@ -173,6 +179,8 @@ void mee::meta_access( else reqs.push_back(mf); + assert(m_data_type != MAC || reqs.size() == 1); + for (unsigned i = 0; i < reqs.size(); ++i) { // assert(reqs.size() == 1); mem_fetch *req = reqs[i]; @@ -307,13 +315,14 @@ void mee::MAC_CHECK_cycle() { // printf("%x\n", OTP_addr); assert(HASH_id); if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 + if (m_unit->get_mpid() == 12) // printf("MAC check: id %d sid %d\n", HASH_id, mf->get_sub_partition_id()); m_MAC_set[HASH_id]--; // m_MAC_table[REQ_addr] = 0; m_MAC_CHECK_queue->pop(); // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } else { - // print_addr("waiting for MAC Check:\t", mf); + print_addr("waiting for MAC Check:\t", mf); // if (mf->get_sub_partition_id() == 32) // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } @@ -359,11 +368,22 @@ void mee::BMT_CHECK_cycle() { BMT_counter++; } else { if (mf->is_write()) { - gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram - assert(!m_BMT_queue->full()); - gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); - } else - gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_BMT_mf(mf, mf->is_write(), META_ACC, 2, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram + assert(!m_BMT_queue->full()); + gen_BMT_mf(mf, false, META_ACC, 32, HASH_id); + } else { + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram + assert(!m_BMT_queue->full()); + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); + } + } else { + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_BMT_mf(mf, false, META_ACC, 32, HASH_id); + } else { + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); + } + } } } } @@ -495,6 +515,8 @@ void mee::MAC_cycle() { mem_fetch *mf = m_MAC_queue->top(); // print_addr("MAC cycle access:\t\t", mf); + assert(mf->get_id()); + if (mf->is_write()) { //对于写MAC请求,则应等待密文被Hash为新MAC值 if (!m_MAC_set[mf->get_id()]) { return; @@ -517,14 +539,14 @@ void mee::MAC_cycle() { // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill - // print_addr("MAC cycle access MISS:\t\t", mf); + print_addr("MAC cycle access MISS:\t\t", mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 // m_MAC_set[mf->get_id()]--; } m_MAC_queue->pop(); MAC_counter++; } else { - // print_addr("CTR cycle RESERVATION_FAIL:\t", mf); + print_addr("MAC cycle RESERVATION_FAIL:\t", mf); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); // print_addr("MAC cycle RESERVATION_FAIL:\t", mf); @@ -605,9 +627,9 @@ void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipelineget_sub_partition_id() == 32 && m_META_RET_queue->full()){ - // print_addr("fill responses ERROR:", mf); - // } + if (m_META_RET_queue->full()){ + // printf("fill responses ERROR: %d\n", m_unit->get_mpid()); + } } } @@ -631,6 +653,8 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M // // printf("CTR Next: %p\n", m_CTR_queue->top()); // } m_unit->dram_mee_queue_pop(); + } else { + print_addr("fill ERROR:\t", mf); } } else if (mf->get_data_type() == m_data_type) { if (mf->is_write() && mf->get_type() == WRITE_ACK) @@ -725,7 +749,7 @@ void mee::simple_cycle(unsigned cycle) { // if (mf->get_access_type() == 9) // printf("%saddr: 
%x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); - // print_addr("L2 to mee: ", mf); + print_addr("L2 to mee: ", mf); // mee to dram assert(mf->is_raw()); // printf("TTTTTTTTTTTTTTTT\n"); @@ -744,8 +768,17 @@ void mee::simple_cycle(unsigned cycle) { // gen_CTR_mf(mf, false, META_ACC, 1, mf_counter);//Lazy_ftech_on_read // gen_CTR_mf(mf, true, META_RBW, 16, mf_counter); // gen_CTR_mf(mf, true, META_ACC, 1, mf_counter); - gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read - gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + // gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_CTR_mf(mf, false, META_ACC, 32, mf_counter);//Lazy_ftech_on_read + gen_CTR_mf(mf, true, META_ACC, 32, mf_counter); + } + else { + gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read + gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + } #ifdef MAC_Enable if (m_config->m_META_config.m_cache_type == SECTOR) @@ -766,7 +799,13 @@ void mee::simple_cycle(unsigned cycle) { mf_counter++; mf->set_id(mf_counter); m_Ciphertext_queue->push(mf); - gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_CTR_mf(mf, false, META_ACC, 32, mf_counter); + } + else { + gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); + } + // gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); #ifdef MAC_Enable if (m_config->m_META_config.m_cache_type == SECTOR) gen_MAC_mf(mf, false, META_ACC, 2, mf_counter); @@ -776,7 +815,7 @@ void mee::simple_cycle(unsigned cycle) { m_unit->L2_mee_queue_pop(cycle&1); } } else { - // if (m_unit->get_mpid() <= 32){ + // if (m_unit->get_mpid() == 0){ // if (m_CTR_RET_queue->full()) // printf("AAAAAAAAAAAAAAAAAAAAAA"); // if (m_MAC_RET_queue->full()) @@ -785,10 +824,10 @@ void mee::simple_cycle(unsigned cycle) { // printf("CCCCCCCCCCCC"); // if (m_AES_queue->full()) // printf("DDDDDDDDDDDDDDDD"); - // // if (m_AES_queue->full()) - // // printf("EEEEEEEEEEEEEEEE"); - // // if (m_unit->mee_dram_queue_empty()) - // // printf("FFFFFFFFFFFFFFFFFF"); + // if (m_AES_queue->full()) + // printf("EEEEEEEEEEEEEEEE"); + // if (m_unit->mee_dram_queue_empty()) + // printf("FFFFFFFFFFFFFFFFFF"); // } } diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 603470e41..b18e118cf 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -89,16 +89,16 @@ class mee { const new_addr_type CTR_mask = 0xFE000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx const new_addr_type MAC_mask = 0xF0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 - //CTR: 1110 00xx xxxx xxxx xxxx xxxx xxxx xxxx - //L1 : 1110 0100 00xx xxxx xxxx xxxx xxxx x000 - //L2 : 1110 0100 0100 00xx xxxx xxxx xxxx x000 - //L3 : 1110 0100 0100 0100 00xx xxxx xxxx x000 - //L4 : 1110 0100 0100 0100 01xx xxxx 0000 x000 - //ROOT:1110 0100 0100 0100 01xx xxxx 0001 0000 - const new_addr_type BMT_base[5] = {0xE0000000, 0xE4000000, 0xE4400000, 0xE4440000, 0xE4444000}; + //CTR: 1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + //L1 : 1111 0010 000x xxxx xxxx xxxx xxxx x000 + //L2 : 1111 0010 0010 000x xxxx xxxx xxxx x000 + //L3 : 1111 0010 0010 0010 00xx xxxx 0xxx x000 + //L4 : 1111 0010 0010 0010 00xx xxxx 1000 0000 + //ROOT:1111 0010 0010 0010 00xx xxxx 1000 1000 + const new_addr_type BMT_base[5] = {0xF0000000, 0xF2000000, 0xF2200000, 0xF2220000, 0xF2220080}; - const 
new_addr_type CTR_base = 0xE0000000;//1110 00xx xxxx xxxx xxxx xxxx xxxx xxxx - const new_addr_type MAC_base = 0xC0000000;//110x xxxx xxxx xxxx xxxx xxxx xxxx x000 + const new_addr_type CTR_base = 0xF0000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_base = 0xE0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 const int m_memcpy_cycle_offset = 0; const int mee_busy_mask = 0; From 5a05cacb983646c07a8c4ef702409a9f2665bafb Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 14 Sep 2024 17:40:27 +0800 Subject: [PATCH 121/133] mee v1.2.9 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index f55afbf9d..646a7859f 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -1,6 +1,6 @@ #include "mee.h" #include -// #define BMT_Enable +#define BMT_Enable #define MAC_Enable mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : From 96eedb0f311e8f48e4a43b05057dfe46a696647f Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 14 Sep 2024 19:41:26 +0800 Subject: [PATCH 122/133] mee v1.2.9 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 646a7859f..6c2e3956a 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -135,7 +135,7 @@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned if (size == 128) partition_addr = partition_addr >> 11 << 7; else - partition_addr = partition_addr >> 7 << 3; + partition_addr = partition_addr >> 9 << 5; new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); BMT_addr |= 0xF2000000; From 6ce08bf504abfb024aef1edcff53dcd266a83a2e Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 29 Sep 2024 19:30:36 +0800 Subject: [PATCH 123/133] mee v1.3.0 --- .../gpgpusim.config_base_mee_normal | 2 +- .../gpgpusim.config_base_mee_sector | 4 +-- src/gpgpu-sim/gpu-cache.h | 2 ++ src/gpgpu-sim/gpu-sim.cc | 36 ++++++++++++++++++- src/gpgpu-sim/gpu-sim.h | 1 + src/gpgpu-sim/l2cache.cc | 16 +++++++++ src/gpgpu-sim/l2cache.h | 10 +++++- src/gpgpu-sim/mee.cc | 10 +++--- 8 files changed, 71 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal index 16c3b2033..b188ece63 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 --gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector index 1c68a53c0..7c57ba2f2 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector @@ -165,8 +165,8 @@ # 32 sets, each 
128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:8:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 8c223fd57..d4a5c244b 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1234,6 +1234,8 @@ class cache_stats { unsigned long long m_cache_port_available_cycles; unsigned long long m_cache_data_port_busy_cycles; unsigned long long m_cache_fill_port_busy_cycles; + + friend class gpgpu_sim; }; class cache_t { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5a1108e09..1fedf0d54 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1385,6 +1385,11 @@ void gpgpu_sim::gpu_print_METACache_stat(char META[]) { if (total_l2_css.accesses > 0) printf("%s_total_cache_miss_rate = %.4lf\n", META, (double)total_l2_css.misses / (double)total_l2_css.accesses); + //secondary MISS + printf("%s_total_cache_secondary_misses = %llu\n", META, l2_stats.m_stats[META_ACC][MSHR_HIT]); + //secondary MISS rate + if (total_l2_css.misses > 0) + printf("%s_total_cache_secondary_miss_rate = %.4lf\n", META, (double)l2_stats.m_stats[META_ACC][MSHR_HIT] / (double)total_l2_css.misses + (double)l2_stats.m_stats[META_ACC][MSHR_HIT]); printf("%s_total_cache_pending_hits = %llu\n", META, total_l2_css.pending_hits); printf("%s_total_cache_reservation_fails = %llu\n", META, total_l2_css.res_fails); @@ -1400,7 +1405,7 @@ void gpgpu_sim::gpu_print_METACache_stat(char META[]) { char META_cache_stats_fail_breakdown[128]; strcpy(META_cache_stats_fail_breakdown, META); strcat(META_cache_stats_fail_breakdown, "_cache_stats_fail_breakdown"); - l2_stats.print_fail_stats(stdout, "L2_cache_stats_fail_breakdown"); + l2_stats.print_fail_stats(stdout, META_cache_stats_fail_breakdown); char META_cache[128]; strcpy(META_cache, META); @@ -1410,6 +1415,32 @@ void gpgpu_sim::gpu_print_METACache_stat(char META[]) { } } +void gpgpu_sim::gpu_print_METACache_data_type_breakdown() { + + printf("\n========= meta cache data type breakdown =========\n"); + + unsigned long long m_cache_tot_DEFAULT_acc = 0; + unsigned long long m_cache_tot_CTR_acc = 0; + unsigned long long m_cache_tot_MAC_acc = 0; + unsigned long long m_cache_tot_BMT_acc = 0; + unsigned long long m_cache_tot_meta_wb = 0; + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_cache_tot_DEFAULT_acc += m_memory_partition_unit[i]->m_cache_DEFAULT_acc; + m_cache_tot_CTR_acc += m_memory_partition_unit[i]->m_cache_CTR_acc; + m_cache_tot_MAC_acc += m_memory_partition_unit[i]->m_cache_MAC_acc; + m_cache_tot_BMT_acc += m_memory_partition_unit[i]->m_cache_BMT_acc; + m_cache_tot_meta_wb += m_memory_partition_unit[i]->m_cache_meta_wb; + } + + printf("m_cache_tot_DEFAULT_acc = %lld\n", m_cache_tot_DEFAULT_acc); + printf("m_cache_tot_CTR_acc = %lld\n", m_cache_tot_CTR_acc); + printf("m_cache_tot_MAC_acc = %lld\n", m_cache_tot_MAC_acc); + printf("m_cache_tot_BMT_acc = %lld\n", m_cache_tot_BMT_acc); + printf("m_cache_tot_meta_wb = %lld\n", m_cache_tot_meta_wb); + +} + void gpgpu_sim::gpu_print_stat() { FILE *statfout = stdout; @@ -1566,6 
+1597,9 @@ void gpgpu_sim::gpu_print_stat() { gpu_print_METACache_stat("MAC"); // BMT cache stats gpu_print_METACache_stat("BMT"); + + // mf data type breakdown + gpu_print_METACache_data_type_breakdown(); if (m_config.gpgpu_cflog_interval != 0) { spill_log_to_file(stdout, 1, gpu_sim_cycle); diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 56159c4d3..af1137858 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -571,6 +571,7 @@ class gpgpu_sim : public gpgpu_t { const gpgpu_sim_config &get_config() const { return m_config; } void gpu_print_METACache_stat(char META[]); + void gpu_print_METACache_data_type_breakdown(); void gpu_print_stat(); void dump_pipeline(int mask, int s, int m) const; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 6b0b70d9b..40385bf04 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -123,6 +123,11 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_sub_partition[p] = new memory_sub_partition(sub_partition_id, m_config, stats, gpu); } + m_cache_DEFAULT_acc = 0; + m_cache_CTR_acc = 0; + m_cache_MAC_acc = 0; + m_cache_BMT_acc = 0; + m_cache_meta_wb = 0; } void memory_partition_unit::handle_memcpy_to_gpu( @@ -410,6 +415,17 @@ void memory_partition_unit::dram_cycle() { mem_fetch *mf = m_dram_latency_queue.front().req; m_dram_latency_queue.pop_front(); m_dram->push(mf); + + if (mf->get_access_type() == META_WRBK_ACC) + m_cache_meta_wb++; + else if (mf->get_data_type() == DEFAULT) + m_cache_DEFAULT_acc++; + else if (mf->get_data_type() == CTR) + m_cache_CTR_acc++; + else if (mf->get_data_type() == MAC) + m_cache_MAC_acc++; + else if (mf->get_data_type() >= BMT_L1) + m_cache_BMT_acc++; // if (mf->get_sub_partition_id() == 0) // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf_return->get_addr(), mf_return->get_sub_partition_id(), mf_return->get_partition_addr(), mf_return->get_access_type()); diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 5bd0950de..ba39a14d5 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -146,6 +146,14 @@ class memory_partition_unit { class metainterface *m_metainterface; partition_mf_allocator *m_mf_allocator; + public: + unsigned long long m_cache_DEFAULT_acc; + unsigned long long m_cache_CTR_acc; + unsigned long long m_cache_MAC_acc; + unsigned long long m_cache_BMT_acc; + unsigned long long m_cache_meta_wb; + + private: fifo_pipeline *m_mee_dram_queue; fifo_pipeline *m_dram_mee_queue; @@ -321,7 +329,7 @@ class L2interface : public mem_fetch_interface { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); m_unit->m_L2_mee_queue->push(mf); // if (mf->get_access_type() == 9) - // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); // printf("l2 to mee access type: %d\n",mf->get_access_type()); } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 6c2e3956a..b9ee881de 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -10,7 +10,7 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMTcache(BMTcache), m_config(config), m_gpu(gpu) { - unsigned len = 16; + unsigned len = 64; m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_queue = 
new fifo_pipeline("meta-queue", 0, len); m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); @@ -288,7 +288,7 @@ void mee::AES_cycle() { // printf("IIIIIIIIIIIIIIII\n"); } } else { - print_addr("waiting for AES:\t", mf); + // print_addr("waiting for AES:\t", mf); // if (mf->is_write()) // printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); } @@ -322,7 +322,7 @@ void mee::MAC_CHECK_cycle() { m_MAC_CHECK_queue->pop(); // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } else { - print_addr("waiting for MAC Check:\t", mf); + // print_addr("waiting for MAC Check:\t", mf); // if (mf->get_sub_partition_id() == 32) // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); } @@ -359,7 +359,7 @@ void mee::BMT_CHECK_cycle() { if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(5)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); - print_addr("BMT Hash:\t", mf); + // print_addr("BMT Hash:\t", mf); //计算下一层BMT if (mf->get_data_type() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); @@ -455,7 +455,7 @@ void mee::CTR_cycle() { m_CTR_queue->pop(); if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 #ifdef BMT_Enable - print_addr("CTR Write:\t", mf); + // print_addr("CTR Write:\t", mf); if (mf->get_id()) m_CTR_BMT_Buffer->push(mf); if (mf->get_id()) From 6f5f2efb15f1fdc785e902c9a125fe8243fe9f7a Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sun, 29 Sep 2024 20:20:13 +0800 Subject: [PATCH 124/133] mee v1.3.0 --- .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector_0crypto | 251 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector_L2_4MB | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 
+++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ ...usim.config_base_mee_sector_large_mdc_64KB | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ .../accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../config_volta_islip.icnt | 74 +++ .../gpgpusim.config_base_mee_sector | 250 +++++++ 
src/gpgpu-sim/gpu-sim.cc | 3 + src/gpgpu-sim/gpu-sim.h | 1 + src/gpgpu-sim/mee.cc | 4 +- 107 files changed, 52293 insertions(+), 2 deletions(-) create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml create mode 100644 
configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml create mode 100644 
configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml create mode 100644 
configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt
 create mode 100644 configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector

diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML component definitions; the XML markup was lost when this patch was flattened to plain text]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML; markup lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git
a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto new file mode 100644 index 000000000..6f5168f4d --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto @@ -0,0 +1,251 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
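The family of config variants added below (base_mee_sector_0crypto, _AES_1, _AES_2, _L2_4MB, _mshr_*, _large_mdc_*) exercises the MEE metadata path wired up by the code hunks earlier in this series; each file configures the metadata cache through -gpgpu_cache:dmeta, and this 0crypto variant additionally sets -gpgpu_crypto_latency 0. For readers tracking the address-map changes, the mee.h hunk of PATCH 120/133 above restores CTR_base, MAC_base and BMT_base, and gen_BMT_mf tags its requests with "BMT_addr |= 0xF2000000". Below is a minimal standalone sketch of that tagging, using only the constants from that hunk; the helper name tag_with_base and the sample offsets are hypothetical, so treat it as a reader's illustration rather than code from the patch.

#include <cstdio>

typedef unsigned long long new_addr_type;

// Region bases as restored by the mee.h hunk of PATCH 120/133.
static const new_addr_type CTR_base = 0xF0000000ULL;  // 1111 000x xxxx ...
static const new_addr_type MAC_base = 0xE0000000ULL;  // 1110 xxxx xxxx ...
static const new_addr_type BMT_base[5] = {0xF0000000ULL, 0xF2000000ULL,
                                          0xF2200000ULL, 0xF2220000ULL,
                                          0xF2220080ULL};

// Hypothetical helper: tag a partition-local metadata offset with its region
// base, in the spirit of "BMT_addr |= 0xF2000000" in gen_BMT_mf.
static new_addr_type tag_with_base(new_addr_type partition_offset,
                                   new_addr_type base) {
  return partition_offset | base;
}

int main() {
  new_addr_type ctr_addr = tag_with_base(0x1240ULL, CTR_base);
  new_addr_type bmt_l1_addr = tag_with_base(0x80ULL, BMT_base[1]);
  printf("CTR addr = 0x%llx, MAC base = 0x%llx, BMT L1 addr = 0x%llx\n",
         ctr_addr, MAC_base, bmt_l1_addr);
  return 0;
}

The actual gen_CTR_mf/gen_MAC_mf/gen_BMT_mf paths in the hunks above additionally rebuild the sub-partition interleaving through get_addr() and pick the request size based on whether the metadata cache is configured as a SECTOR cache (32- versus 128-byte CTR/BMT accesses, 2- versus 8-byte MAC accesses), as the simple_cycle and BMT_CHECK_cycle changes show.
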
+ + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_crypto_latency 0 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
  [... 623 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..7c57ba2f2 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each (128 bits) 16 bytes wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz, V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, so it can issue a column command and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
  [... 623 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
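The -gpgpu_cache:dl2 and -gpgpu_cache:dmeta strings in the config that ends above encode sets:line-size:ways in their first group. A small sketch of the capacity arithmetic they imply; the 32-channel by 2-sub-partition layout comes from the config itself, while treating the new dmeta (MEE metadata) cache as one instance per memory sub-partition is an assumption made here for illustration:

// l2_dmeta_capacity.cc -- illustrative only; geometry taken from the cache strings above
#include <cstdio>

int main() {
  const unsigned sub_partitions = 32 * 2;   // -gpgpu_n_mem 32, -gpgpu_n_sub_partition_per_mchannel 2

  // -gpgpu_cache:dl2 S:32:128:24,... -> 32 sets, 128 B lines, 24 ways per sub-partition
  const unsigned dl2_bytes   = 32 * 128 * 24;   // 96 KB, matching the comment in the config
  // -gpgpu_cache:dmeta S:4:128:4,... -> 4 sets, 128 B lines, 4 ways (assumed per sub-partition)
  const unsigned dmeta_bytes = 4 * 128 * 4;     // 2 KB

  std::printf("dl2 total:   %u KB\n", dl2_bytes   * sub_partitions / 1024);  // 6144 KB = 6 MB
  std::printf("dmeta total: %u KB\n", dmeta_bytes * sub_partitions / 1024);  // 128 KB
  return 0;
}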
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
  [... 623 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..7c57ba2f2 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each (128 bits) 16 bytes wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz, V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, so it can issue a column command and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
  [... 623 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
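The adaptive L1/shared-memory options in the config above (-gpgpu_adaptive_cache_config 1, -gpgpu_unified_l1d_size 128 and -gpgpu_shmem_option 0,8,16,32,64,96) are described in the config comments as carving a unified 128 KB between shared memory and L1D, with whatever shared memory does not claim going to the L1 data cache. A minimal sketch of that reading (illustrative only, not code from the patch):

// adaptive_l1_split.cc -- illustrative reading of the adaptive cache options above
#include <cstdio>

int main() {
  const unsigned unified_kb = 128;                              // -gpgpu_unified_l1d_size 128
  const unsigned shmem_options_kb[] = {0, 8, 16, 32, 64, 96};   // -gpgpu_shmem_option

  // Per the config comment, shared memory takes one of these sizes and the rest of the
  // unified 128 KB is assigned to the L1 data cache (0 KB shared -> 128 KB L1D).
  for (unsigned shmem : shmem_options_kb)
    std::printf("shared %2u KB -> L1D %3u KB\n", shmem, unified_kb - shmem);
  return 0;
}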
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
  [... 623 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
  [... 613 added XML lines: markup stripped, element content not recoverable ...]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB new file mode 100644 index 000000000..daca760c7 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:16,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..7c57ba2f2 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..7c57ba2f2 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameters (XML content stripped during extraction) -->
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..aa433f7ff --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:8:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
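As a quick cross-check of the cache options in the base_mee_sector_large_mdc_4KB gpgpusim.config above, the sizes can be recomputed by hand. Reading each cache config string as <sets>:<line size in bytes>:<associativity>, as the file's own comments do, and taking the configured 2 sub-partitions per memory channel and 32 channels:

# dl2:   32 sets x 24 ways x 128 B = 96 KB per memory sub-partition
#        96 KB x 2 sub-partitions x 32 channels = 6 MB of L2 in total
# dmeta: 8 sets x 4 ways x 128 B = 4 KB per sub-partition, the metadata-cache
#        size that gives this mdc_4KB variant its name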
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB new file mode 100644 index 000000000..e238c00c9 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:32:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
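A similarly rough check of the DRAM settings in the base_mee_sector_large_mdc_64KB gpgpusim.config above (the HBM parameters are the same in all of these variants), assuming -gpgpu_dram_buswidth is bytes per channel and that -dram_data_command_freq_ratio 2 puts the data bus at twice the 850 MHz command clock:

# 32 channels x 16 B x 2 x 0.85 GHz = about 870 GB/s
# in line with the roughly 900 GB/s of HBM2 bandwidth commonly quoted for V100-class parts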
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..1d3198516 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:16:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
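Relative to the base_mee_sector_large_mdc_4KB configuration earlier in this patch, the base_mee_sector_large_mdc_8KB gpgpusim.config above appears to differ only in the dmeta line; with the same <sets>:<line size>:<associativity> reading:

# 4 KB variant: -gpgpu_cache:dmeta S:8:128:4   ->  8 sets x 4 ways x 128 B = 4 KB per sub-partition
# 8 KB variant: -gpgpu_cache:dmeta S:16:128:4  -> 16 sets x 4 ways x 128 B = 8 KB per sub-partition
# only the set count doubles; line size and associativity are unchanged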
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..4503d3682 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:128:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..f48e9c9d7 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:32:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..7c57ba2f2 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[remainder of the preceding AccelWattch XML configuration file; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines of AccelWattch XML configuration; element content not recoverable in this copy of the patch]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector
new file mode 100644
index 000000000..7c57ba2f2
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 4000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 1fedf0d54..046b4b939 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -217,6 +217,9 @@ void power_config::reg_options(class OptionParser *opp) { } void memory_config::reg_options(class OptionParser *opp) { + option_parser_register(opp, "-gpgpu_crypto_latency", OPT_INT32, + &m_crypto_latency, "gpgpu secmem crypto latency", + "40"); option_parser_register(opp, "-gpgpu_perf_sim_memcpy", OPT_BOOL, &m_perf_sim_memcpy, "Fill the L2 cache on memcpy", "1"); diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index af1137858..c389ee320 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -351,6 +351,7 @@ class memory_config { unsigned gpgpu_frfcfs_dram_write_queue_size; unsigned write_high_watermark; unsigned write_low_watermark; + unsigned m_crypto_latency; bool m_perf_sim_memcpy; bool simple_dram_model; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index b9ee881de..2ff35924b 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -21,10 +21,10 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, len); m_Ciphertext_RET_queue = new 
fifo_pipeline("meta-queue", 0, len + 100); - m_OTP_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_OTP_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); m_AES_queue = new fifo_pipeline("meta-queue", 0, len); - m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_HASH_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); // m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); From a6f99537a83f980ea02f478d5207c925c24d5ac3 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Mon, 7 Oct 2024 00:04:10 +0800 Subject: [PATCH 125/133] mee v1.3.1 --- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/l2cache.cc | 110 +++++++++++++++++-------- src/gpgpu-sim/l2cache.h | 52 +++++++++--- src/gpgpu-sim/mee.cc | 165 +++++++++++++++++++------------------- src/gpgpu-sim/mee.h | 2 +- src/gpgpu-sim/mem_fetch.h | 18 ++++- 6 files changed, 215 insertions(+), 134 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 046b4b939..b56dbcec7 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1392,7 +1392,7 @@ void gpgpu_sim::gpu_print_METACache_stat(char META[]) { printf("%s_total_cache_secondary_misses = %llu\n", META, l2_stats.m_stats[META_ACC][MSHR_HIT]); //secondary MISS rate if (total_l2_css.misses > 0) - printf("%s_total_cache_secondary_miss_rate = %.4lf\n", META, (double)l2_stats.m_stats[META_ACC][MSHR_HIT] / (double)total_l2_css.misses + (double)l2_stats.m_stats[META_ACC][MSHR_HIT]); + printf("%s_total_cache_secondary_miss_rate = %.4lf\n", META, (double)l2_stats.m_stats[META_ACC][MSHR_HIT] / ((double)total_l2_css.misses + (double)l2_stats.m_stats[META_ACC][MSHR_HIT])); printf("%s_total_cache_pending_hits = %llu\n", META, total_l2_css.pending_hits); printf("%s_total_cache_reservation_fails = %llu\n", META, total_l2_css.res_fails); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 40385bf04..23ff71bcc 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -82,38 +82,46 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_gpu(gpu) { m_dram = new dram_t(m_id, m_config, m_stats, this, gpu); + unsigned int icnt_L2; + unsigned int L2_dram; + unsigned int dram_L2; + unsigned int L2_icnt; + sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, + &dram_L2, &L2_icnt); + + m_mee_dram_queue[TOT] = new fifo_pipeline("mee-to-dram", 0, 1); + m_dram_mee_queue[TOT] = new fifo_pipeline("dram-to-mee", 0, 1); + for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { + m_mee_dram_queue[i] = new fifo_pipeline("mee-to-dram", 0, L2_dram); + m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, dram_L2); + } + char CTRc_name[32]; char MACc_name[32]; char BMTc_name[32]; snprintf(CTRc_name, 32, "CTR_bank_%03d\0", m_id); snprintf(MACc_name, 32, "MAC_bank_%03d\0", m_id); snprintf(BMTc_name, 32, "BMT_bank_%03d\0", m_id); - m_metainterface = new metainterface(this); + // m_metainterface = new metainterface(this); + m_BMTinterface = new metainterface(m_mee_dram_queue[BMT]); + m_CTRinterface = new metainterface(m_mee_dram_queue[CTR]); + m_MACinterface = new metainterface(m_mee_dram_queue[MAC]); m_mf_allocator = new partition_mf_allocator(config); if (!m_config->m_META_config.disabled()) { m_CTRcache = - new meta_cache(CTRc_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(CTRc_name, 
m_config->m_META_config, -1, -1, m_CTRinterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_MACcache = - new meta_cache(MACc_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(MACc_name, m_config->m_META_config, -1, -1, m_MACinterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); m_BMTcache = - new meta_cache(BMTc_name, m_config->m_META_config, -1, -1, m_metainterface, + new meta_cache(BMTc_name, m_config->m_META_config, -1, -1, m_BMTinterface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); } - - unsigned int icnt_L2; - unsigned int L2_dram; - unsigned int dram_L2; - unsigned int L2_icnt; - sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, - &dram_L2, &L2_icnt); - m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); - m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); - - m_mee = new mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); + m_mee = new mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); + m_sub_partition = new memory_sub_partition *[m_config->m_n_sub_partition_per_memory_channel]; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; @@ -146,7 +154,8 @@ void memory_partition_unit::handle_memcpy_to_gpu( memory_partition_unit::~memory_partition_unit() { delete m_dram; delete m_CTRcache; - delete m_metainterface; + // delete m_metainterface; + delete m_BMTinterface; delete m_mee; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; p++) { @@ -273,6 +282,40 @@ int memory_partition_unit::global_sub_partition_id_to_local_id( m_id * m_config->m_n_sub_partition_per_memory_channel); } +void memory_partition_unit::mee_to_dram_cycle() { + // mee to dram 队列满了就停止发送 + if (m_mee_dram_queue[TOT]->full()) return; + //发送队列高于阈值优先发送 + for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { + unsigned dtype = i; + if (m_mee_dram_queue[dtype]->get_n_element() >= send_trigger_threshold) { + m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_mee_dram_queue[dtype]->pop(); + return; + } + } + //返回队列高于阈值停止发送 + for (unsigned i = 0; i < NUM_DATA_TYPE; i++) { + unsigned dtype = (i + last_send + 1) % NUM_DATA_TYPE; + if (dtype == 0) continue; + if (m_mee_dram_queue[dtype]->empty()) continue; + if (m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; + m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_mee_dram_queue[dtype]->pop(); + last_send = dtype; + return; + } +} + +void memory_partition_unit::dram_to_mee_cycle() { + if (m_dram_mee_queue[TOT]->empty()) return; + mem_fetch *mf_return = m_dram_mee_queue[TOT]->top(); + if (!m_dram_mee_queue[mf_return->get_data_type()]->full()) { + m_dram_mee_queue[mf_return->get_data_type()]->push(mf_return); + m_dram_mee_queue[TOT]->pop(); + } +} + void memory_partition_unit::simple_dram_model_cycle() { // pop completed memory request from dram and push it to dram-to-L2 queue // of the original sub partition @@ -372,6 +415,9 @@ void memory_partition_unit::dram_cycle() { m_dram->cycle(); m_dram->dram_log(SAMPLELOG); + + mee_to_dram_cycle(); + dram_to_mee_cycle(); // mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); // if( !m_dram->full(mf->is_write()) ) { @@ -742,48 +788,48 @@ void memory_partition_unit::L2_mee_queue_pop(unsigned spid) { m_sub_partition[sp // interface to mee_dram_queue bool memory_partition_unit::mee_dram_queue_empty() const { - return m_mee_dram_queue->empty(); // TODO + return m_mee_dram_queue[TOT]->empty(); // TODO } class mem_fetch 
*memory_partition_unit::mee_dram_queue_top() const { - return m_mee_dram_queue->top(); // TODO + return m_mee_dram_queue[TOT]->top(); // TODO } -void memory_partition_unit::mee_dram_queue_pop() { m_mee_dram_queue->pop(); } // TODO +void memory_partition_unit::mee_dram_queue_pop() { m_mee_dram_queue[TOT]->pop(); } // TODO -bool memory_partition_unit::mee_dram_queue_full() const { - return m_mee_dram_queue->full(); //TODO +bool memory_partition_unit::mee_dram_queue_full(enum data_type dtype) const { + return m_mee_dram_queue[dtype]->full(); //TODO } -bool memory_partition_unit::mee_dram_queue_full(int size) const { - return m_mee_dram_queue->full(size); //TODO +bool memory_partition_unit::mee_dram_queue_full(int size, enum data_type dtype) const { + return m_mee_dram_queue[dtype]->full(size); //TODO } -void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf) { +void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf, enum data_type dtype) { if (get_mpid() == 0) { // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %lld\n", "mee to dram push:\t", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } - m_mee_dram_queue->push(mf); //TODO + m_mee_dram_queue[dtype]->push(mf); //TODO } // interface to dram_mee_queue -bool memory_partition_unit::dram_mee_queue_empty() const { - return m_dram_mee_queue->empty(); // TODO +bool memory_partition_unit::dram_mee_queue_empty(enum data_type dtype) const { + return m_dram_mee_queue[dtype]->empty(); // TODO } -class mem_fetch *memory_partition_unit::dram_mee_queue_top() const { - return m_dram_mee_queue->top(); // TODO +class mem_fetch *memory_partition_unit::dram_mee_queue_top(enum data_type dtype) const { + return m_dram_mee_queue[dtype]->top(); // TODO } -void memory_partition_unit::dram_mee_queue_pop() { m_dram_mee_queue->pop(); } // TODO +void memory_partition_unit::dram_mee_queue_pop(enum data_type dtype) { m_dram_mee_queue[dtype]->pop(); } // TODO bool memory_partition_unit::dram_mee_queue_full() const { - return m_dram_mee_queue->full(); //TODO + return m_dram_mee_queue[TOT]->full(); //TODO } void memory_partition_unit::dram_mee_queue_push(class mem_fetch *mf) { - m_dram_mee_queue->push(mf); //TODO + m_dram_mee_queue[TOT]->push(mf); //TODO } // interface to mee_L2_queue diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index ba39a14d5..2036778cc 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -77,6 +77,8 @@ class memory_partition_unit { bool busy() const; + void dram_to_mee_cycle(); + void mee_to_dram_cycle(); void cache_cycle(unsigned cycle); void dram_cycle(); void simple_dram_model_cycle(); @@ -117,13 +119,13 @@ class memory_partition_unit { bool mee_dram_queue_empty() const; class mem_fetch *mee_dram_queue_top() const; void mee_dram_queue_pop(); - bool mee_dram_queue_full() const; - bool mee_dram_queue_full(int size) const; - void mee_dram_queue_push(class mem_fetch *mf); + bool mee_dram_queue_full(enum data_type dtype) const; + bool mee_dram_queue_full(int size, enum data_type dtype) const; + void mee_dram_queue_push(class mem_fetch *mf, enum data_type dtype); - bool dram_mee_queue_empty() const; - class mem_fetch *dram_mee_queue_top() const; - void dram_mee_queue_pop(); + bool dram_mee_queue_empty(enum data_type dtype) const; + class mem_fetch *dram_mee_queue_top(enum data_type 
dtype) const; + void dram_mee_queue_pop(enum data_type dtype); bool dram_mee_queue_full() const; void dram_mee_queue_push(class mem_fetch *mf); @@ -143,7 +145,10 @@ class memory_partition_unit { class meta_cache *m_MACcache; class meta_cache *m_BMTcache; class mee *m_mee; - class metainterface *m_metainterface; + // class metainterface *m_metainterface; + class metainterface *m_BMTinterface; + class metainterface *m_CTRinterface; + class metainterface *m_MACinterface; partition_mf_allocator *m_mf_allocator; public: @@ -154,8 +159,20 @@ class memory_partition_unit { unsigned long long m_cache_meta_wb; private: - fifo_pipeline *m_mee_dram_queue; - fifo_pipeline *m_dram_mee_queue; + fifo_pipeline *m_mee_dram_queue[5]; + fifo_pipeline *m_dram_mee_queue[5]; + const unsigned send_trigger_threshold = 16; + const unsigned receive_stop_threshold = 16; + unsigned last_send = 0; + // fifo_pipeline *m_NORM_dram_queue; + // fifo_pipeline *m_CTR_dram_queue; + // fifo_pipeline *m_MAC_dram_queue; + // fifo_pipeline *m_BMT_dram_queue; + + // fifo_pipeline *m_dram_NORM_queue; + // fifo_pipeline *m_dram_CTR_queue; + // fifo_pipeline *m_dram_MAC_queue; + // fifo_pipeline *m_dram_BMT_queue; class arbitration_metadata { public: @@ -340,21 +357,30 @@ class L2interface : public mem_fetch_interface { class metainterface : public mem_fetch_interface { public: - metainterface(memory_partition_unit *unit) { m_unit = unit; } + // metainterface(memory_partition_unit *unit, enum cache_type dtype) { + metainterface(fifo_pipeline *pipeline) { + // m_unit = unit; + // m_dtype = dtype; + this->pipeline = pipeline; + } virtual ~metainterface() {} virtual bool full(unsigned size, bool write) const { // assume read and write packets all same size - return m_unit->mee_dram_queue_full(); + // return m_unit->mee_dram_queue_full(); + return pipeline->full(); } virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); - // printf("%saddr: %x\tmf_type: %d\tsp_addr: %x\taccess type:%d\n", "mee to dram:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tmf_type: %d\tsp_addr: %x\taccess type:%d\n", "mee to dram:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); - m_unit->mee_dram_queue_push(mf); + // m_unit->mee_dram_queue_push(mf); + pipeline->push(mf); } private: memory_partition_unit *m_unit; + enum cache_type m_dtype; + fifo_pipeline *pipeline; }; #endif diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 2ff35924b..f2cb8ce29 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -107,7 +107,7 @@ void mee::gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned meta_access(m_CTR_queue, CTR_addr, meta_acc, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, CTR); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, CTR, DEFAULT); } void mee::gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { @@ -122,7 +122,7 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned meta_access(m_MAC_queue, MAC_addr, meta_acc, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, MAC); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, MAC, DEFAULT); } void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { @@ -139,17 +139,17 
@@ void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); BMT_addr |= 0xF2000000; - enum data_type BMT_type = static_cast(mf->get_data_type() + 1); + enum BMT_Layer BMT_type = static_cast(mf->get_BMT_Layer() + 1); meta_access(m_BMT_queue, BMT_addr, meta_acc, size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT_type); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT, BMT_type); } void mee::meta_access( fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type) const { + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type, enum BMT_Layer m_Layer) const { mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; @@ -186,6 +186,7 @@ void mee::meta_access( mem_fetch *req = reqs[i]; // req->set_id(mf_id); req->set_data_type(m_data_type); + req->set_BMT_Layer(m_Layer); if (i == reqs.size() - 1) req->set_id(mf_id); else @@ -222,7 +223,7 @@ void mee::CT_cycle() { } } - if (!m_Ciphertext_queue->empty() && CT_counter < OTP_counter) { + if (!m_Ciphertext_queue->empty()) { mem_fetch *mf = m_Ciphertext_queue->top(); // print_addr("L2 to mee:\t", mf); if (mf->is_write()) { // write @@ -244,8 +245,8 @@ void mee::CT_cycle() { // printf("SSSSSSSSSSSSSSSSSSS"); } } - } else if (!m_unit->mee_dram_queue_full()) { // read - m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 + } else if (!m_unit->mee_dram_queue_full(NORM)) { // read + m_unit->mee_dram_queue_push(mf, NORM); //读密文请求,发往DRAM中读密文 m_Ciphertext_queue->pop(); CT_counter++; } @@ -268,9 +269,9 @@ void mee::AES_cycle() { if (mf->is_write()) { //加密 // assert(!mf->is_write()); // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); - if (!m_unit->mee_dram_queue_full() && !m_HASH_queue->full()) { + if (!m_unit->mee_dram_queue_full(NORM) && !m_HASH_queue->full()) { m_OTP_set[OTP_id]--; - m_unit->mee_dram_queue_push(mf); //加密完后更新DRAM中的密文 + m_unit->mee_dram_queue_push(mf, NORM); //加密完后更新DRAM中的密文 CT_counter++; m_HASH_queue->push(new hash(MAC, mf->get_id())); //加密完后得到密文,对密文进行MAC Hash m_AES_queue->pop(); @@ -361,7 +362,7 @@ void mee::BMT_CHECK_cycle() { m_BMT_CHECK_queue->pop(); // print_addr("BMT Hash:\t", mf); //计算下一层BMT - if (mf->get_data_type() == BMT_L4) { + if (mf->get_BMT_Layer() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; if (mf->get_id()) @@ -401,7 +402,7 @@ void mee::BMT_CHECK_cycle() { // } // CTR to BMT - if (!m_CTR_BMT_Buffer->empty() && !m_BMT_CHECK_queue->full() && !m_HASH_queue->full() && !BMT_busy) { + if (!m_CTR_BMT_Buffer->empty() && !m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); // gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); @@ -437,7 +438,7 @@ void mee::CTR_cycle() { bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full(); bool port_free = m_unit->m_CTRcache->data_port_free(); - if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free && CTR_counter <= BMT_counter) { + if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full(CTR) && !output_full && port_free) { mem_fetch *mf = m_CTR_queue->top(); // print_addr("CTR cycle access:\t\t", mf); @@ -511,7 +512,7 @@ void mee::MAC_cycle() { bool output_full = 
m_MAC_CHECK_queue->full() || m_MAC_RET_queue->full();// && bool port_free = m_unit->m_MACcache->data_port_free(); - if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free && MAC_counter < CT_counter) { + if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full(MAC) && !output_full && port_free) { mem_fetch *mf = m_MAC_queue->top(); // print_addr("MAC cycle access:\t\t", mf); @@ -581,7 +582,7 @@ void mee::BMT_cycle() { // assert(mf->get_access_type() == META_RBW); } - if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full() && !output_full && port_free) { + if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full(BMT) && !output_full && port_free) { mem_fetch *mf = m_BMT_queue->top(); print_addr("BMT waiting access:\t", mf); // assert(mf->get_access_type() == mf->get_access_type()); @@ -636,32 +637,36 @@ void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type) { // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); - if ((mf->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf)) { - // print_addr("wating for fill:\t\t", mf); - if (m_METAcache->fill_port_free()) { - // assert(mf->get_access_type() != META_WR_ALLOC_R); - m_METAcache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + - m_memcpy_cycle_offset); - // if (m_data_type == MAC) - // print_addr("MAC fill:\t", mf); - assert(!mf->is_write()); - // if (m_METAcache == m_BMTcache) - // print_addr("fill:\t\t\t\t", mf); - // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); - // if (mf->get_sub_partition_id() == 1) { - // printf("CTR Fill: %p\n", mf); - // // printf("CTR Next: %p\n", m_CTR_queue->top()); - // } - m_unit->dram_mee_queue_pop(); - } else { - print_addr("fill ERROR:\t", mf); + if (!m_unit->dram_mee_queue_empty(m_data_type)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); + print_addr("fill: \t", mf_return); + if ((mf_return->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf_return)) { + // print_addr("wating for fill:\t\t", mf); + if (m_METAcache->fill_port_free()) { + // assert(mf->get_access_type() != META_WR_ALLOC_R); + m_METAcache->fill(mf_return, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + + m_memcpy_cycle_offset); + // if (m_data_type == MAC) + // print_addr("MAC fill:\t", mf); + assert(!mf_return->is_write()); + // if (m_METAcache == m_BMTcache) + // print_addr("fill:\t\t\t\t", mf); + // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); + // if (mf->get_sub_partition_id() == 1) { + // printf("CTR Fill: %p\n", mf); + // // printf("CTR Next: %p\n", m_CTR_queue->top()); + // } + m_unit->dram_mee_queue_pop(m_data_type); + } else { + print_addr("fill ERROR:\t", mf_return); + } + } else if (mf_return->get_data_type() == m_data_type) { + if (mf_return->is_write() && mf_return->get_type() == WRITE_ACK) + mf_return->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_META_RET_queue->push(mf); + m_unit->dram_mee_queue_pop(m_data_type); } - } else if (mf->get_data_type() == m_data_type) { - if (mf->is_write() && mf->get_type() == WRITE_ACK) - mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, - m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - // m_META_RET_queue->push(mf); - m_unit->dram_mee_queue_pop(); } } @@ -679,10 +684,13 @@ void 
mee::simple_cycle(unsigned cycle) { META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[1]); // } // META_fill_responses(m_BMTcache); + META_fill(m_CTRcache, m_CTR_RET_queue, NULL, CTR_mask, CTR_base, CTR); + META_fill(m_MACcache, m_MAC_RET_queue, NULL, MAC_mask, MAC_base, MAC); + META_fill(m_BMTcache, m_BMT_RET_queue, NULL, BMT_mask[1], BMT_base[1], BMT); // dram to mee - if (!m_unit->dram_mee_queue_empty()) { - mem_fetch *mf_return = m_unit->dram_mee_queue_top(); + if (!m_unit->dram_mee_queue_empty(NORM)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(NORM); // assert(!mf_return->is_write()); // if (mf_return->get_sub_partition_id() == 58) print_addr("waiting for fill:\t", mf_return); @@ -695,7 +703,7 @@ void mee::simple_cycle(unsigned cycle) { // mf_return->get_access_type() == L2_WRBK_ACC ) { assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); - m_unit->dram_mee_queue_pop(); + m_unit->dram_mee_queue_pop(NORM); } else { // print_addr("dram_mee_queue_top:\t", mf_return); @@ -704,38 +712,21 @@ void mee::simple_cycle(unsigned cycle) { // META_fill(m_MACcache, mf_return, MAC_mask); // META_fill(m_BMTcache, mf_return); // if (!m_unit->mee_L2_queue_full()) { - - if (mf_return->get_access_type() >= META_ACC) { // META访存的返回,需要响应 - // printf("Success handle CTR_ACC: "); - // print_addr("META return to mee", mf_return); - // delete mf_return; - META_fill(m_CTRcache, m_CTR_RET_queue, mf_return, CTR_mask, CTR_base, CTR); - META_fill(m_MACcache, m_MAC_RET_queue, mf_return, MAC_mask, MAC_base, MAC); - // for (int layer = 1; layer <= 4; layer++) { - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L1); - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L2); - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L3); - META_fill(m_BMTcache, m_BMT_RET_queue, mf_return, BMT_mask[1], BMT_base[1], BMT_L4); - // } - } else { // 密文访存返回 - // assert(mf_return->get_access_type() != 4); - // reply L2 read - // reply L2 write back - //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); - int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); - assert(mf_return->get_access_type() < META_ACC); - if (!m_Ciphertext_RET_queue->full()) { - // m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 - // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; - // assert(m_MAC_table[(new_addr_type)mf_return]); - // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check - m_Ciphertext_RET_queue->push(mf_return); - m_unit->dram_mee_queue_pop(); - // printf("HHHHHHHHHHHHHHHH"); - } else { - // printf("HHHHHHHHHHHHHHHH"); - } - // print_addr("mee to L2: ", mf_return); + // reply L2 read + // reply L2 write back + //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + assert(mf_return->get_access_type() < META_ACC); + if (!m_Ciphertext_RET_queue->full()) { + // m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 + // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf_return]); + // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check + m_Ciphertext_RET_queue->push(mf_return); + m_unit->dram_mee_queue_pop(NORM); + // 
printf("HHHHHHHHHHHHHHHH"); + } else { + // printf("HHHHHHHHHHHHHHHH"); } } } else if (!m_unit->mee_dram_queue_empty()) { @@ -793,7 +784,7 @@ void mee::simple_cycle(unsigned cycle) { // mf->set_cooked_status(); // printf("BBBBBBBBBBBBBBBBB"); // } - } else if (!m_unit->mee_dram_queue_full()) { // read + } else if (!m_unit->mee_dram_queue_full(NORM)) { // read // printf("CCCCCCCCCCCCCCCC"); // m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 mf_counter++; @@ -844,8 +835,8 @@ void mee::simple_cycle(unsigned cycle) { } void mee::cycle(unsigned cycle) { - if (!m_unit->dram_mee_queue_empty()) { - mem_fetch *mf_return = m_unit->dram_mee_queue_top(); + if (!m_unit->dram_mee_queue_empty(NORM)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(NORM); int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); if (false // mf_return->get_is_write() || @@ -855,21 +846,21 @@ void mee::cycle(unsigned cycle) { // mf_return->get_access_type() == L2_WRBK_ACC ) { // assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); - m_unit->dram_mee_queue_pop(); + m_unit->dram_mee_queue_pop(NORM); } else { if (!m_unit->mee_L2_queue_full(spid)) { // m_OTP_table[REQ_addr] = 0; // print_addr("mee to L2 R:\t", mf); m_unit->mee_L2_queue_push(spid, mf_return); - m_unit->dram_mee_queue_pop(); + m_unit->dram_mee_queue_pop(NORM); } } } if (!m_unit->L2_mee_queue_empty(cycle&1)) { mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); - if (!m_unit->mee_dram_queue_full()) { - m_unit->mee_dram_queue_push(mf); + if (!m_unit->mee_dram_queue_full(NORM)) { + m_unit->mee_dram_queue_push(mf, NORM); m_unit->L2_mee_queue_pop(cycle&1); } } @@ -894,4 +885,12 @@ void mee::cycle(unsigned cycle) { //Sector //deepbench //可配置 -//lazy_fetch_on_read \ No newline at end of file +//lazy_fetch_on_read + +//mee<-->dram queue +//write back +//BMT_Layer + +//CTR_counter <= BMT_counter +//CT_counter < OTP_counter +//MAC_counter < CT_counter \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index b18e118cf..44d11dd52 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -19,7 +19,7 @@ class mee { void print_tag(); void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type) const; + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type, enum BMT_Layer m_Layer) const; void CTR_cycle(); void MAC_cycle(); void BMT_cycle(); diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index c47cc7430..55c01f4f5 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -34,10 +34,16 @@ #include "addrdec.h" enum data_type { - DEFAULT = 0, - MAC, + TOT = 0, BMT, CTR, + NORM, + MAC, + NUM_DATA_TYPE +}; + +enum BMT_Layer { + DEFAULT = 0, BMT_L1, BMT_L2, BMT_L3, @@ -147,7 +153,10 @@ class mem_fetch { void set_id(unsigned id) { this->id = id; } enum data_type get_data_type() { return this->m_data_type; } - void set_data_type(enum data_type m_data_type) { this->m_data_type = m_data_type; } + void set_data_type(enum data_type dtype) { this->m_data_type = dtype; } + + enum BMT_Layer get_BMT_Layer() { return this->m_BMT_Layer; } + void set_BMT_Layer(enum BMT_Layer Layer) { this->m_BMT_Layer = Layer; } private: // request source information @@ -197,7 +206,8 @@ class mem_fetch { // when fetch-on-write policy is used bool raw_data = true; unsigned id; - enum 
data_type m_data_type = DEFAULT; + enum data_type m_data_type = NORM; + enum BMT_Layer m_BMT_Layer = DEFAULT; }; #endif From 0241f19d57b65fdf122aeeadb9d03bb29a5ca985 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Mon, 7 Oct 2024 13:07:05 +0800 Subject: [PATCH 126/133] mee v1.3.2 --- src/gpgpu-sim/gpu-sim.cc | 6 +++--- src/gpgpu-sim/l2cache.cc | 8 ++++---- src/gpgpu-sim/l2cache.h | 2 +- src/gpgpu-sim/mee.cc | 22 +++++++++++----------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index b56dbcec7..f3554dd47 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1422,21 +1422,21 @@ void gpgpu_sim::gpu_print_METACache_data_type_breakdown() { printf("\n========= meta cache data type breakdown =========\n"); - unsigned long long m_cache_tot_DEFAULT_acc = 0; + unsigned long long m_cache_tot_NORM_acc = 0; unsigned long long m_cache_tot_CTR_acc = 0; unsigned long long m_cache_tot_MAC_acc = 0; unsigned long long m_cache_tot_BMT_acc = 0; unsigned long long m_cache_tot_meta_wb = 0; for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - m_cache_tot_DEFAULT_acc += m_memory_partition_unit[i]->m_cache_DEFAULT_acc; + m_cache_tot_NORM_acc += m_memory_partition_unit[i]->m_cache_NORM_acc; m_cache_tot_CTR_acc += m_memory_partition_unit[i]->m_cache_CTR_acc; m_cache_tot_MAC_acc += m_memory_partition_unit[i]->m_cache_MAC_acc; m_cache_tot_BMT_acc += m_memory_partition_unit[i]->m_cache_BMT_acc; m_cache_tot_meta_wb += m_memory_partition_unit[i]->m_cache_meta_wb; } - printf("m_cache_tot_DEFAULT_acc = %lld\n", m_cache_tot_DEFAULT_acc); + printf("m_cache_tot_NORM_acc = %lld\n", m_cache_tot_NORM_acc); printf("m_cache_tot_CTR_acc = %lld\n", m_cache_tot_CTR_acc); printf("m_cache_tot_MAC_acc = %lld\n", m_cache_tot_MAC_acc); printf("m_cache_tot_BMT_acc = %lld\n", m_cache_tot_BMT_acc); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 23ff71bcc..fe5aa985d 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -131,7 +131,7 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_sub_partition[p] = new memory_sub_partition(sub_partition_id, m_config, stats, gpu); } - m_cache_DEFAULT_acc = 0; + m_cache_NORM_acc = 0; m_cache_CTR_acc = 0; m_cache_MAC_acc = 0; m_cache_BMT_acc = 0; @@ -464,13 +464,13 @@ void memory_partition_unit::dram_cycle() { if (mf->get_access_type() == META_WRBK_ACC) m_cache_meta_wb++; - else if (mf->get_data_type() == DEFAULT) - m_cache_DEFAULT_acc++; + else if (mf->get_data_type() == NORM) + m_cache_NORM_acc++; else if (mf->get_data_type() == CTR) m_cache_CTR_acc++; else if (mf->get_data_type() == MAC) m_cache_MAC_acc++; - else if (mf->get_data_type() >= BMT_L1) + else if (mf->get_data_type() == BMT) m_cache_BMT_acc++; // if (mf->get_sub_partition_id() == 0) // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf_return->get_addr(), mf_return->get_sub_partition_id(), mf_return->get_partition_addr(), mf_return->get_access_type()); diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 2036778cc..3dfb70227 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -152,7 +152,7 @@ class memory_partition_unit { partition_mf_allocator *m_mf_allocator; public: - unsigned long long m_cache_DEFAULT_acc; + unsigned long long m_cache_NORM_acc; unsigned long long m_cache_CTR_acc; unsigned long long m_cache_MAC_acc; unsigned long long m_cache_BMT_acc; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc 
index f2cb8ce29..40b72e9bb 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -38,7 +38,7 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 12) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // printf("%saddr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); } } @@ -360,7 +360,7 @@ void mee::BMT_CHECK_cycle() { if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(5)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); - // print_addr("BMT Hash:\t", mf); + print_addr("BMT Hash:\t", mf); //计算下一层BMT if (mf->get_BMT_Layer() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); @@ -584,7 +584,7 @@ void mee::BMT_cycle() { if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full(BMT) && !output_full && port_free) { mem_fetch *mf = m_BMT_queue->top(); - print_addr("BMT waiting access:\t", mf); + // print_addr("BMT waiting access:\t", mf); // assert(mf->get_access_type() == mf->get_access_type()); // if (mf->get_access_type() == META_RBW) { @@ -600,17 +600,17 @@ void mee::BMT_cycle() { bool read_sent = was_read_sent(events); // print_addr("CTR cycle access:\t\t", mf); if (status == HIT) { - print_addr("BMT access HIT:\t", mf); + // print_addr("BMT access HIT:\t", mf); if (mf->get_id() && !mf->is_write()) { m_BMT_CHECK_queue->push(mf); m_HASH_queue->push(new hash(BMT, mf->get_id())); } m_BMT_queue->pop(); } else if (status != RESERVATION_FAIL) { - print_addr("BMT access MISS:\t", mf); + // print_addr("BMT access MISS:\t", mf); m_BMT_queue->pop(); } else { - print_addr("BMT access reservation_fail:\t", mf); + // print_addr("BMT access reservation_fail:\t", mf); assert(!write_sent); assert(!read_sent); } @@ -624,7 +624,7 @@ void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipelinepush(mf); // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) - print_addr("fill responses:\t", mf); + // print_addr("fill responses:\t", mf); // reply(m_METAcache, mf); // delete mf; } else { @@ -639,7 +639,7 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M if (!m_unit->dram_mee_queue_empty(m_data_type)) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); - print_addr("fill: \t", mf_return); + // print_addr("fill: \t", mf_return); if ((mf_return->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf_return)) { // print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { @@ -658,7 +658,7 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M // } m_unit->dram_mee_queue_pop(m_data_type); } else { - print_addr("fill ERROR:\t", mf_return); + // print_addr("fill ERROR:\t", mf_return); } } else if (mf_return->get_data_type() == m_data_type) { if (mf_return->is_write() && mf_return->get_type() == WRITE_ACK) @@ -693,7 +693,7 @@ 
void mee::simple_cycle(unsigned cycle) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(NORM); // assert(!mf_return->is_write()); // if (mf_return->get_sub_partition_id() == 58) - print_addr("waiting for fill:\t", mf_return); + // print_addr("waiting for fill:\t", mf_return); // printf("%saddr: %x\tdata_type: %d\tsp_addr: %x\taccess type:%d\n", "fill queue:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); if (false @@ -740,7 +740,7 @@ void mee::simple_cycle(unsigned cycle) { // if (mf->get_access_type() == 9) // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); - print_addr("L2 to mee: ", mf); + // print_addr("L2 to mee: ", mf); // mee to dram assert(mf->is_raw()); // printf("TTTTTTTTTTTTTTTT\n"); From 4353173fa9b357ba64c4f13f493b2448fabc5931 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Tue, 8 Oct 2024 20:44:44 +0800 Subject: [PATCH 127/133] mee v1.3.3 --- src/gpgpu-sim/gpu-sim.h | 1 + src/gpgpu-sim/l2cache.h | 2 +- src/gpgpu-sim/mee.cc | 28 ++++++++++++++++++---------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index c389ee320..d2f5e16eb 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -351,6 +351,7 @@ class memory_config { unsigned gpgpu_frfcfs_dram_write_queue_size; unsigned write_high_watermark; unsigned write_low_watermark; + unsigned m_AES_Engines; unsigned m_crypto_latency; bool m_perf_sim_memcpy; bool simple_dram_model; diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 3dfb70227..2358dde85 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -346,7 +346,7 @@ class L2interface : public mem_fetch_interface { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); m_unit->m_L2_mee_queue->push(mf); // if (mf->get_access_type() == 9) - // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\twr: %d\taccess type:%d\n", "L2 to mee:", mf->get_addr(), mf->get_sid(), mf->get_is_write(), mf->get_partition_addr(), mf->get_access_type()); // printf("l2 to mee access type: %d\n",mf->get_access_type()); } diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 40b72e9bb..39737f560 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -38,7 +38,10 @@ int decode(int addr) { } void mee::print_addr(char s[], mem_fetch *mf) { if (m_unit->get_mpid() == 12) { - // printf("%saddr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", s, mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // printf("%s\t", s); + // if (mf->get_original_mf()) + // printf("original_addr: %x\toriginal_sp_addr: %x\t", mf->get_original_mf()->get_addr(), mf->get_original_mf()->get_partition_addr()); + // printf("addr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); 
// print_tag(); } } @@ -94,7 +97,7 @@ void mee::gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned // minor_addr = 128 + minor_addr * 7; // bool res = minor_addr & 7 > 1; // minor_addr >>= 3; - partition_addr = (partition_addr >> 14 << 7); + partition_addr = (partition_addr >> 7); // if (meta_acc == META_ACC) // partition_addr |= minor_addr; @@ -154,17 +157,18 @@ void mee::meta_access( mem_access_byte_mask_t byte_mask; mem_access_sector_mask_t sector_mask; unsigned data_size = 0; - for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); if (size == 128) { for (unsigned i = 0; i < size / 32; i++) sector_mask.set(i); addr = addr >> 7 << 7; + for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); data_size = 128; } else { for (unsigned i = (addr >> 5) & 3; i < ((addr >> 5) & 3) + ((size + 31) / 32); i++) sector_mask.set(i); addr = addr >> 5 << 5; + for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); data_size = 32; // sector_mask.set((addr >> 5) & 3); } @@ -422,11 +426,8 @@ void mee::CTR_cycle() { } else { //CTR读MISS返回,CTR写一定命中 assert(!mf_return->is_write()); // print_addr("MISS OTP:\t\t", mf_return); - if (!m_OTP_queue->full() && !m_CTR_BMT_Buffer->full()) { //CTR读MISS,则应生成CTR to BMT任务 + if (!m_OTP_queue->full()) { //CTR读MISS,则应生成CTR to BMT任务 m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 - #ifdef BMT_Enable - m_CTR_BMT_Buffer->push(mf_return); - #endif m_CTR_RET_queue->pop(); } } @@ -548,6 +549,7 @@ void mee::MAC_cycle() { MAC_counter++; } else { print_addr("MAC cycle RESERVATION_FAIL:\t", mf); + m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); // print_addr("MAC cycle RESERVATION_FAIL:\t", mf); @@ -640,13 +642,19 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M if (!m_unit->dram_mee_queue_empty(m_data_type)) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); // print_addr("fill: \t", mf_return); + #ifdef BMT_Enable + if (m_data_type == CTR) + if (!m_META_RET_queue->full()) + m_META_RET_queue->push(mf_return); + else + return; + #endif if ((mf_return->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf_return)) { // print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { // assert(mf->get_access_type() != META_WR_ALLOC_R); m_METAcache->fill(mf_return, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - // if (m_data_type == MAC) // print_addr("MAC fill:\t", mf); assert(!mf_return->is_write()); // if (m_METAcache == m_BMTcache) @@ -684,7 +692,7 @@ void mee::simple_cycle(unsigned cycle) { META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[1]); // } // META_fill_responses(m_BMTcache); - META_fill(m_CTRcache, m_CTR_RET_queue, NULL, CTR_mask, CTR_base, CTR); + META_fill(m_CTRcache, m_CTR_BMT_Buffer, NULL, CTR_mask, CTR_base, CTR); META_fill(m_MACcache, m_MAC_RET_queue, NULL, MAC_mask, MAC_base, MAC); META_fill(m_BMTcache, m_BMT_RET_queue, NULL, BMT_mask[1], BMT_base[1], BMT); @@ -747,7 +755,7 @@ void mee::simple_cycle(unsigned cycle) { if (((m_config->m_META_config.m_cache_type == SECTOR && !m_CTR_queue->full(8)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_CTR_queue->full(2))) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { - // 
assert(!mf->is_write()); + assert(!mf->is_write()); if (mf->is_write()) { // write assert(mf->is_raw()); // printf("LLLLLLLLLLLLLLLLLLL"); From 5a65d18a1d8045c6fc8617cda638f829a4515a60 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Tue, 8 Oct 2024 21:03:04 +0800 Subject: [PATCH 128/133] mee v1.3.3 --- ...mee_sector => gpgpusim.config_base_mee_sector_large_mdc_4KB} | 0 ...mee_sector => gpgpusim.config_base_mee_sector_large_mdc_8KB} | 0 .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 .../gpgpusim.config_base_mee_sector_mdc_1x16} | 2 +- .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 .../gpgpusim.config_base_mee_sector_mdc_2x8} | 2 +- ...base_mee_sector => gpgpusim.config_base_mee_sector_mshr_128} | 0 ..._base_mee_sector => gpgpusim.config_base_mee_sector_mshr_32} | 0 .../gpgpusim.config_base_mee_sector_mshr_64} | 0 21 files changed, 2 insertions(+), 2 deletions(-) rename configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/{gpgpusim.config_base_mee_sector => gpgpusim.config_base_mee_sector_large_mdc_4KB} (100%) rename configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/{gpgpusim.config_base_mee_sector => gpgpusim.config_base_mee_sector_large_mdc_8KB} (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1 => base_mee_sector_mdc_1x16}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector => base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16} (99%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2 => base_mee_sector_mdc_2x8}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_2/gpgpusim.config_base_mee_sector => 
base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8} (99%) rename configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/{gpgpusim.config_base_mee_sector => gpgpusim.config_base_mee_sector_mshr_128} (100%) rename configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/{gpgpusim.config_base_mee_sector => gpgpusim.config_base_mee_sector_mshr_32} (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_AES_1/gpgpusim.config_base_mee_sector => base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64} (100%) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim_alt.xml similarity index 100% rename from 
configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 index 7c57ba2f2..4a0117d9c 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:1:128:16,L:B:m:L:X,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim.xml diff --git 
a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 index 7c57ba2f2..42af8e8de 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_2/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:2:128:8,L:B:m:L:X,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_AES_1/gpgpusim.config_base_mee_sector rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 From 76045294bc221cabf31df721f371fd82b2e991b8 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Tue, 8 Oct 2024 21:07:10 +0800 Subject: [PATCH 129/133] mee v1.3.3 --- src/gpgpu-sim/mee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 39737f560..8a7875aca 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -755,7 +755,7 @@ void mee::simple_cycle(unsigned cycle) { if (((m_config->m_META_config.m_cache_type == SECTOR && 
!m_CTR_queue->full(8)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_CTR_queue->full(2))) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { - assert(!mf->is_write()); + // assert(!mf->is_write()); if (mf->is_write()) { // write assert(mf->is_raw()); // printf("LLLLLLLLLLLLLLLLLLL"); From 5e01b129454c88006ce68aa6eaa66ee33688c466 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Thu, 24 Oct 2024 10:25:21 +0800 Subject: [PATCH 130/133] mee v1.3.4 --- src/gpgpu-sim/l2cache.cc | 3 ++- src/gpgpu-sim/mee.cc | 27 +++++++++++++++++++-------- src/gpgpu-sim/mee.h | 1 + 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index fe5aa985d..66ef7eb74 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -93,7 +93,7 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_dram_mee_queue[TOT] = new fifo_pipeline("dram-to-mee", 0, 1); for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { m_mee_dram_queue[i] = new fifo_pipeline("mee-to-dram", 0, L2_dram); - m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, dram_L2); + m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, 256); } char CTRc_name[32]; @@ -289,6 +289,7 @@ void memory_partition_unit::mee_to_dram_cycle() { for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { unsigned dtype = i; if (m_mee_dram_queue[dtype]->get_n_element() >= send_trigger_threshold) { + if (m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); m_mee_dram_queue[dtype]->pop(); return; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 8a7875aca..69c801483 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -22,10 +22,10 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, len + 100); m_OTP_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); - m_AES_queue = new fifo_pipeline("meta-queue", 0, len); + m_AES_queue = new fifo_pipeline("meta-queue", 0, 4); m_HASH_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); - m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 4); // m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); @@ -37,7 +37,7 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (m_unit->get_mpid() == 12) { + if (m_unit->get_mpid() == 1) { // printf("%s\t", s); // if (mf->get_original_mf()) // printf("original_addr: %x\toriginal_sp_addr: %x\t", mf->get_original_mf()->get_addr(), mf->get_original_mf()->get_partition_addr()); @@ -286,6 +286,7 @@ void mee::AES_cycle() { // m_OTP_table[REQ_addr] = 0; // print_addr("mee to L2 R:\t", mf); m_unit->mee_L2_queue_push(spid, mf); //解密完后返回L2 + print_addr("MEE to L2:\t", mf); // printf("JJJJJJJJJJJJJJJJJJJJJJJJJ"); m_AES_queue->pop(); @@ -626,7 +627,7 @@ void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipelinepush(mf); // assert(mf->get_access_type() == META_ACC); // if (m_METAcache == m_BMTcache) - // print_addr("fill responses:\t", mf); + print_addr("fill responses:\t", mf); // reply(m_METAcache, mf); // delete mf; } else { @@ -641,7 +642,7 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline 
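The CTR/OTP/AES queues around AES_cycle() model a counter-mode pipeline: once the counter value is available, the one-time pad can be computed in parallel with the data fetch, and the final decryption is a XOR before the block is pushed back to L2. The sketch below shows only the conventional counter-mode dataflow with a toy stand-in cipher; the simulator itself only models the latency of this step, and nothing here (function names, the mixing constant) comes from the patch.

#include <array>
#include <cstdint>
#include <cstdio>

// Toy stand-in for the block cipher; NOT real AES, purely illustrative.
static std::array<uint8_t, 16> toy_block_cipher(uint64_t counter, uint64_t addr) {
  std::array<uint8_t, 16> pad{};
  uint64_t x = counter * 0x9e3779b97f4a7c15ULL ^ addr;
  for (int i = 0; i < 16; i++) pad[i] = (uint8_t)(x >> ((i % 8) * 8)) ^ (uint8_t)i;
  return pad;
}

// Counter-mode decryption: the pad depends only on (counter, address), so it
// can be computed while the ciphertext is still in flight from DRAM; the
// final step is a XOR, which is why the pipeline overlaps the CTR fetch with
// the data fetch and finishes in AES_cycle before pushing back to L2.
void otp_decrypt(std::array<uint8_t, 16> &block, uint64_t counter, uint64_t addr) {
  std::array<uint8_t, 16> otp = toy_block_cipher(counter, addr);
  for (int i = 0; i < 16; i++) block[i] ^= otp[i];
}

int main() {
  std::array<uint8_t, 16> data = {1, 2, 3, 4};
  otp_decrypt(data, /*counter=*/7, /*addr=*/0x80000000ULL);  // "encrypt"
  otp_decrypt(data, /*counter=*/7, /*addr=*/0x80000000ULL);  // decrypt restores it
  printf("%d %d %d %d\n", data[0], data[1], data[2], data[3]);  // prints 1 2 3 4
  return 0;
}
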
*m_M if (!m_unit->dram_mee_queue_empty(m_data_type)) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); - // print_addr("fill: \t", mf_return); + print_addr("fill: \t", mf_return); #ifdef BMT_Enable if (m_data_type == CTR) if (!m_META_RET_queue->full()) @@ -714,7 +715,7 @@ void mee::simple_cycle(unsigned cycle) { m_unit->dram_mee_queue_pop(NORM); } else { - // print_addr("dram_mee_queue_top:\t", mf_return); + print_addr("dram to mee:\t", mf_return); // mee to L2 // META_fill(m_MACcache, mf_return, MAC_mask); @@ -748,13 +749,15 @@ void mee::simple_cycle(unsigned cycle) { // if (mf->get_access_type() == 9) // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); - // print_addr("L2 to mee: ", mf); + // mee to dram assert(mf->is_raw()); // printf("TTTTTTTTTTTTTTTT\n"); if (((m_config->m_META_config.m_cache_type == SECTOR && !m_CTR_queue->full(8)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_CTR_queue->full(2))) && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { + print_addr("L2 to mee: ", mf); + DL_CNT = 0; // assert(!mf->is_write()); if (mf->is_write()) { // write assert(mf->is_raw()); @@ -814,6 +817,10 @@ void mee::simple_cycle(unsigned cycle) { m_unit->L2_mee_queue_pop(cycle&1); } } else { + DL_CNT++; + if (DL_CNT >= 10000) { + printf("DEAD LOCK! \n"); + } // if (m_unit->get_mpid() == 0){ // if (m_CTR_RET_queue->full()) // printf("AAAAAAAAAAAAAAAAAAAAAA"); @@ -901,4 +908,8 @@ void mee::cycle(unsigned cycle) { //CTR_counter <= BMT_counter //CT_counter < OTP_counter -//MAC_counter < CT_counter \ No newline at end of file +//MAC_counter < CT_counter + + +//实现一个中间类,bridge +// \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 44d11dd52..d6041ca1a 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -122,5 +122,6 @@ class mee { unsigned CTR_counter = 0; unsigned BMT_counter = 0; int var; + unsigned DL_CNT = 0; }; \ No newline at end of file From 542535014943e88d9c3778a7edfd1e1c740aa91a Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 2 Nov 2024 01:22:15 +0800 Subject: [PATCH 131/133] mee v1.3.9 --- .../gpgpusim.config_base_mee_normal | 2 +- .../gpgpusim.config_base_mee_sector | 4 +- .../gpgpusim.config_base_mee_sector_L2_4MB | 2 +- .../gpgpusim.config_base_mee_sector | 2 +- .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 ...sim.config_base_mee_sector_large_mdc_16KB} | 2 +- .../gpgpusim.config_base_mee_sector | 2 +- .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 ...sim.config_base_mee_sector_large_mdc_32KB} | 2 +- ...pusim.config_base_mee_sector_large_mdc_4KB | 2 +- ...usim.config_base_mee_sector_large_mdc_64KB | 2 +- ...pusim.config_base_mee_sector_large_mdc_8KB | 2 +- .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 .../gpgpusim.config_base_mee_sector_mdc_4x16} | 3 +- .../accelwattch_ptx_sim.xml | 0 .../accelwattch_ptx_sim_alt.xml | 
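DL_CNT, added in this hunk, is a stall watchdog: it is cleared whenever a request is accepted from the L2-to-mee queue and incremented each cycle the request cannot proceed, and after 10000 consecutive stalled cycles a deadlock warning is printed. A reduced sketch of the same pattern, with illustrative names:

#include <cstdio>

// Minimal stall watchdog in the style of mee::DL_CNT: reset on progress,
// count consecutive stalled cycles, flag a likely deadlock past a limit.
class stall_watchdog {
 public:
  explicit stall_watchdog(unsigned limit) : m_limit(limit), m_count(0) {}
  void progress() { m_count = 0; }
  void stalled(unsigned mpid) {
    if (++m_count >= m_limit)
      printf("DEAD LOCK! mpid: %u (stalled %u cycles)\n", mpid, m_count);
  }
 private:
  unsigned m_limit;
  unsigned m_count;
};

int main() {
  stall_watchdog wd(10000);
  for (unsigned cycle = 0; cycle < 10001; cycle++) wd.stalled(1);  // no progress
  wd.progress();  // a request finally went through; the counter resets
  return 0;
}
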
0 .../accelwattch_sass_hw.xml | 0 .../accelwattch_sass_hybrid.xml | 0 .../accelwattch_sass_sim.xml | 0 .../accelwattch_sass_sim_alt.xml | 0 .../config_volta_islip.icnt | 0 .../gpgpusim.config_base_mee_sector_mdc_4x8} | 2 +- src/gpgpu-sim/gpu-cache.cc | 14 ++-- src/gpgpu-sim/gpu-cache.h | 4 +- src/gpgpu-sim/hashing.cc | 36 ++++++++- src/gpgpu-sim/l2cache.cc | 17 ++++- src/gpgpu-sim/l2cache.h | 2 + src/gpgpu-sim/mee.cc | 75 ++++++++++++------- src/gpgpu-sim/mee.h | 2 + 47 files changed, 123 insertions(+), 54 deletions(-) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee => base_mee_sector_large_mdc_16KB}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 => base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB} (99%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto => base_mee_sector_large_mdc_32KB}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee/gpgpusim.config_base_mee => base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB} (99%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_1x16 => base_mee_sector_mdc_4x16}/config_volta_islip.icnt (100%) rename 
configs/tested-cfgs/SM7_QV100/{base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto => base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16} (99%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_ptx_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_ptx_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_sass_hw.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_sass_hybrid.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_sass_sim.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/accelwattch_sass_sim_alt.xml (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8}/config_volta_islip.icnt (100%) rename configs/tested-cfgs/SM7_QV100/{base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 => base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8} (99%) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal index b188ece63..82e8410e2 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 --gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector index 7c57ba2f2..3ad0563ba 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector @@ -165,8 +165,8 @@ # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:P,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB index daca760c7..f7149bf16 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:16,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector index 7c57ba2f2..74f0caa86 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hybrid.xml diff --git 
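As the comment carried in this hunk notes, the geometry in these config strings reads <sets>:<line bytes>:<ways> per memory sub-partition, so S:32:128:24 is 96 KB per sub-partition and 6 MB of L2 in total; the same arithmetic is what the large_mdc_4KB/8KB/16KB/32KB/64KB directory names describe for the dmeta cache. A quick check of that arithmetic (the 64 sub-partition count is inferred from the 96 KB-to-6 MB ratio, not stated in this hunk):

#include <cstdio>

// Capacity per sub-partition for a "<sets>:<line>:<assoc>" cache config string.
static unsigned capacity_bytes(unsigned sets, unsigned line, unsigned assoc) {
  return sets * line * assoc;
}

int main() {
  // L2: S:32:128:24 -> 96 KB per sub-partition; 64 sub-partitions -> 6 MB.
  printf("dl2   per sub-partition: %u KB\n", capacity_bytes(32, 128, 24) / 1024);
  printf("dl2   total: %u MB\n", capacity_bytes(32, 128, 24) * 64 / (1024 * 1024));
  // Metadata cache sweep: S:8:128:4 -> 4 KB, S:16:128:4 -> 8 KB, S:32:128:4 -> 16 KB ...
  printf("dmeta S:8:128:4  -> %u KB\n", capacity_bytes(8, 128, 4) / 1024);
  printf("dmeta S:32:128:4 -> %u KB\n", capacity_bytes(32, 128, 4) / 1024);
  return 0;
}
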
a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB index 4a0117d9c..68172310b 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/gpgpusim.config_base_mee_sector_mdc_1x16 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:1:128:16,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:32:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector index 7c57ba2f2..9f75f7d93 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,P:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml 
b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB index c344ca2ca..cc24459c0 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee/gpgpusim.config_base_mee +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 --gpgpu_cache:dmeta N:4:128:4,L:B:m:L:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta S:64:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB index aa433f7ff..b8fdd00ba 100644 --- 
a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:8:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:8:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB index e238c00c9..0806c9e3d 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:32:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:128:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB index 1d3198516..dfe15529c 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:16:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:16:128:4,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hw.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hw.xml diff --git 
a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_1x16/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 index 6f5168f4d..f7ebd50cd 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_0crypto/gpgpusim.config_base_mee_sector_0crypto +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 @@ -166,8 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 --gpgpu_crypto_latency 0 +-gpgpu_cache:dmeta S:4:128:16,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_ptx_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hw.xml 
similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hw.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hw.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hybrid.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_hybrid.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hybrid.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim_alt.xml similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/accelwattch_sass_sim_alt.xml rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim_alt.xml diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/config_volta_islip.icnt similarity index 100% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/config_volta_islip.icnt rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/config_volta_islip.icnt diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 similarity index 99% rename from configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 rename to configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 index 42af8e8de..6b1bc1591 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_2x8/gpgpusim.config_base_mee_sector_mdc_2x8 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 @@ -166,7 +166,7 @@ #-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 #-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 --gpgpu_cache:dmeta S:2:128:8,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dmeta S:4:128:8,L:B:m:L:P,A:64:64,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 0 diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index bcc7f989c..62966155b 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -335,7 +335,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, unsigned &idx, mem_fetch *mf) { bool wb = false; evicted_block_info evicted; - enum cache_request_status result = access(addr, time, idx, wb, evicted, mf); + enum cache_request_status result = access(addr, time, idx, wb, evicted, mf, false); assert(!wb); return result; } @@ -343,11 +343,13 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, unsigned &idx, bool &wb, evicted_block_info 
&evicted, - mem_fetch *mf) { + mem_fetch *mf, bool mshr_hit_avail) { m_access++; is_used = true; shader_cache_access_log(m_core_id, m_type_id, 0); // log accesses to cache enum cache_request_status status = probe(addr, idx, mf, mf->is_write()); + if (mshr_hit_avail && status == MISS && !mf->get_is_write()) + status = HIT_RESERVED; switch (status) { case HIT_RESERVED: m_pending_hit++; @@ -1163,7 +1165,7 @@ void baseline_cache::send_read_request(new_addr_type addr, if (read_only) m_tag_array->access(block_addr, time, cache_index, mf); else - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, mshr_hit & mshr_avail); m_mshrs.add(mshr_addr, mf); m_stats.inc_stats(mf->get_access_type(), MSHR_HIT); @@ -1174,7 +1176,7 @@ void baseline_cache::send_read_request(new_addr_type addr, if (read_only) m_tag_array->access(block_addr, time, cache_index, mf); else - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, mshr_hit & mshr_avail); m_mshrs.add(mshr_addr, mf); m_extra_mf_fields[mf] = extra_mf_fields( @@ -1414,7 +1416,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted_block_info evicted; cache_request_status status = - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, false); assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); if (!block->is_modified_line()) { @@ -1547,7 +1549,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( evicted_block_info evicted; cache_request_status m_status = - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, false); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); if (!block->is_modified_line()) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index d4a5c244b..a0899834e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -944,7 +944,7 @@ class tag_array { unsigned &idx, mem_fetch *mf); enum cache_request_status access(new_addr_type addr, unsigned time, unsigned &idx, bool &wb, - evicted_block_info &evicted, mem_fetch *mf); + evicted_block_info &evicted, mem_fetch *mf, bool mshr_hit_avail); void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); @@ -1004,7 +1004,7 @@ class tag_array { typedef tr1_hash_map line_table; line_table pending_lines; - friend class basline_cache; + friend class baseline_cache; friend class l2_cache; friend class mee; }; diff --git a/src/gpgpu-sim/hashing.cc b/src/gpgpu-sim/hashing.cc index f566aa471..514b46a6e 100644 --- a/src/gpgpu-sim/hashing.cc +++ b/src/gpgpu-sim/hashing.cc @@ -35,7 +35,41 @@ unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index, * exit in GPGPU applications and also show good performance for other * strides. 
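The new mshr_hit_avail argument to tag_array::access() lets the caller flag a read miss that will merge into an already-pending MSHR entry; such an access is then booked as HIT_RESERVED instead of MISS. A reduced sketch of just that reclassification (the enum is trimmed to the values involved, and classify() is an illustrative helper, not a simulator function):

#include <cassert>

enum cache_request_status { HIT, HIT_RESERVED, MISS, RESERVATION_FAIL };

// Reclassify a read miss that merges into an already-pending MSHR entry: the
// line is effectively reserved by the earlier miss, so the access counts as
// HIT_RESERVED rather than a fresh MISS.
cache_request_status classify(cache_request_status probe_status,
                              bool is_write, bool mshr_hit, bool mshr_avail) {
  bool mshr_hit_avail = mshr_hit && mshr_avail;  // callers pass mshr_hit & mshr_avail
  if (mshr_hit_avail && probe_status == MISS && !is_write) return HIT_RESERVED;
  return probe_status;
}

int main() {
  assert(classify(MISS, /*is_write=*/false, true, true) == HIT_RESERVED);
  assert(classify(MISS, /*is_write=*/false, true, false) == MISS);  // MSHR full
  assert(classify(MISS, /*is_write=*/true, true, true) == MISS);    // writes unchanged
  return 0;
}
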
*/ - if (bank_set_num == 16) { + if (bank_set_num == 2) { + std::bitset<64> a(higher_bits); + std::bitset<1> b(index); + std::bitset<1> new_index(index); + + new_index[0] = a[11] ^ a[9] ^ a[5] ^ a[4] ^ a[3] ^ a[2] ^ a[0] ^ b[0]; + + return new_index.to_ulong(); + + } if (bank_set_num == 4) { + std::bitset<64> a(higher_bits); + std::bitset<2> b(index); + std::bitset<2> new_index(index); + + new_index[0] = + a[10] ^ a[9] ^ a[7] ^ a[6] ^ a[4] ^ a[3] ^ a[1] ^ a[0] ^ b[0]; + new_index[1] = + a[9] ^ a[8] ^ a[6] ^ a[5] ^ a[3] ^ a[2] ^ a[1] ^ a[0] ^ b[1]; + + return new_index.to_ulong(); + + } if (bank_set_num == 8) { + std::bitset<64> a(higher_bits); + std::bitset<3> b(index); + std::bitset<3> new_index(index); + + new_index[0] = + a[11] ^ a[10] ^ a[9] ^ a[7] ^ a[4] ^ a[3] ^ a[2] ^ a[0] ^ b[0]; + new_index[1] = + a[12] ^ a[9] ^ a[8] ^ a[7] ^ a[5] ^ a[2] ^ a[1] ^ a[0] ^ b[1]; + new_index[2] = a[10] ^ a[9] ^ a[8] ^ a[6] ^ a[3] ^ a[2] ^ a[1] ^ b[2]; + + return new_index.to_ulong(); + + } if (bank_set_num == 16) { std::bitset<64> a(higher_bits); std::bitset<4> b(index); std::bitset<4> new_index(index); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 66ef7eb74..7e4d9a605 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -93,7 +93,7 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_dram_mee_queue[TOT] = new fifo_pipeline("dram-to-mee", 0, 1); for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { m_mee_dram_queue[i] = new fifo_pipeline("mee-to-dram", 0, L2_dram); - m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, 256); + m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, 128); } char CTRc_name[32]; @@ -289,8 +289,11 @@ void memory_partition_unit::mee_to_dram_cycle() { for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { unsigned dtype = i; if (m_mee_dram_queue[dtype]->get_n_element() >= send_trigger_threshold) { - if (m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; + if (m_n_mf[dtype] + m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_n_mf[dtype]++; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d append %x acc_type: %d\n", get_mpid(), dtype, m_n_mf[dtype], m_mee_dram_queue[dtype]->top()->get_addr(), m_mee_dram_queue[dtype]->top()->get_access_type()); m_mee_dram_queue[dtype]->pop(); return; } @@ -300,8 +303,11 @@ void memory_partition_unit::mee_to_dram_cycle() { unsigned dtype = (i + last_send + 1) % NUM_DATA_TYPE; if (dtype == 0) continue; if (m_mee_dram_queue[dtype]->empty()) continue; - if (m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; + if (m_n_mf[dtype] + m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_n_mf[dtype]++; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d append %x acc_type: %d\n", get_mpid(), dtype, m_n_mf[dtype], m_mee_dram_queue[dtype]->top()->get_addr(), m_mee_dram_queue[dtype]->top()->get_access_type()); m_mee_dram_queue[dtype]->pop(); last_send = dtype; return; @@ -309,10 +315,14 @@ void memory_partition_unit::mee_to_dram_cycle() { } void memory_partition_unit::dram_to_mee_cycle() { + //L2_WRBK_ACC在DRAM中被Delete,不会发往mee if (m_dram_mee_queue[TOT]->empty()) return; mem_fetch *mf_return = m_dram_mee_queue[TOT]->top(); if (!m_dram_mee_queue[mf_return->get_data_type()]->full()) { m_dram_mee_queue[mf_return->get_data_type()]->push(mf_return); + 
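The hashing.cc hunk extends ipoly_hash_function() with XOR-folding set-index hashes for 2, 4 and 8 sets alongside the existing 16-set case: each output index bit is the parity of a fixed selection of higher address bits XORed into the original index bit. The 4-set case lifted out as a stand-alone function, with the bit selections copied from the patch:

#include <bitset>
#include <cstdio>

typedef unsigned long long new_addr_type;

// XOR-folded set index for a 4-set cache, as added in hashing.cc: each output
// bit mixes a fixed set of higher address bits into the original index bit so
// that power-of-two strides spread across the sets.
unsigned ipoly_hash_4sets(new_addr_type higher_bits, unsigned index) {
  std::bitset<64> a(higher_bits);
  std::bitset<2> b(index);
  std::bitset<2> new_index(index);
  new_index[0] = a[10] ^ a[9] ^ a[7] ^ a[6] ^ a[4] ^ a[3] ^ a[1] ^ a[0] ^ b[0];
  new_index[1] = a[9] ^ a[8] ^ a[6] ^ a[5] ^ a[3] ^ a[2] ^ a[1] ^ a[0] ^ b[1];
  return (unsigned)new_index.to_ulong();
}

int main() {
  // Consecutive higher_bits values no longer all land in the same set.
  for (new_addr_type hb = 0; hb < 8; hb++)
    printf("higher_bits=%llu -> set %u\n", hb, ipoly_hash_4sets(hb, 0));
  return 0;
}
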
m_n_mf[mf_return->get_data_type()]--; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d pop %x acc_type: %d\n", get_mpid(), mf_return->get_data_type(), m_n_mf[mf_return->get_data_type()], mf_return->get_addr(), mf_return->get_access_type()); m_dram_mee_queue[TOT]->pop(); } } @@ -487,6 +497,7 @@ void memory_partition_unit::set_done(mem_fetch *mf) { assert(m_sub_partition[spid]->get_id() == global_spid); if (mf->get_access_type() == L1_WRBK_ACC || mf->get_access_type() == L2_WRBK_ACC) { + m_n_mf[mf->get_data_type()]--; m_arbitration_metadata.return_credit(spid); MEMPART_DPRINTF( "mem_fetch request %p return from dram to sub partition %d\n", mf, diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 2358dde85..272e60e9f 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -157,10 +157,12 @@ class memory_partition_unit { unsigned long long m_cache_MAC_acc; unsigned long long m_cache_BMT_acc; unsigned long long m_cache_meta_wb; + void pop_n_mf(enum data_type dtype) { m_n_mf[dtype]--; } private: fifo_pipeline *m_mee_dram_queue[5]; fifo_pipeline *m_dram_mee_queue[5]; + unsigned m_n_mf[5] = {0, 0, 0, 0, 0}; const unsigned send_trigger_threshold = 16; const unsigned receive_stop_threshold = 16; unsigned last_send = 0; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index 69c801483..d60d7e56d 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -37,23 +37,33 @@ int decode(int addr) { return (addr & 16128) >> 8; } void mee::print_addr(char s[], mem_fetch *mf) { - if (m_unit->get_mpid() == 1) { - // printf("%s\t", s); - // if (mf->get_original_mf()) - // printf("original_addr: %x\toriginal_sp_addr: %x\t", mf->get_original_mf()->get_addr(), mf->get_original_mf()->get_partition_addr()); - // printf("addr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); - } + // if (m_unit->get_mpid() == 14) { + // printf("%s\t", s); + // if (mf->get_original_mf()) + // printf("original_addr: %x\toriginal_sp_addr: %x\t", mf->get_original_mf()->get_addr(), mf->get_original_mf()->get_partition_addr()); + // printf("addr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // } +} + +void mee::print_status(class meta_cache *m_METAcache, mem_fetch *mf) { + // if (m_unit->get_mpid() == 14) { + // unsigned idx = m_METAcache->m_config.set_index(mf->get_addr()); + // enum cache_request_status status = m_METAcache->m_tag_array->probe(mf->get_addr(), idx, mf->get_access_sector_mask(), mf->is_write()); + // printf("idx is %u\t", idx); + // printf("sector mask is %u\n", mf->get_access_sector_mask().to_ulong()); + // m_METAcache->m_tag_array->m_lines[idx]->print_status(); + // } } void mee::print_tag() { // if (get_sub_partition_id(mf) == 0) { // for (unsigned i = 0; i < m_config->m_META_config.get_num_lines(); i++) { for (unsigned i = 188; i < 192; i++) { - printf("line %d:\t", i); - for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) - // printf("%d\t", - 
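The l2cache changes above add a per-data-type in-flight counter, m_n_mf: it is incremented when a request of that type is forwarded into the shared mee-to-DRAM queue, decremented when its reply lands in the per-type dram-to-mee queue (or when a write-back is retired in set_done), and a type is skipped whenever in-flight plus already-buffered replies would reach receive_stop_threshold, so the per-type return queue cannot be overrun. A reduced sketch of that credit discipline (the queue type and the demo in main() are simplified):

#include <cstdio>
#include <queue>

// Per-type credit gate in the style of memory_partition_unit::mee_to_dram_cycle():
// never let (requests in flight to DRAM) + (replies already buffered) reach the
// capacity reserved for that type's return path.
struct type_credit {
  unsigned in_flight = 0;        // m_n_mf[dtype]
  std::queue<int> return_queue;  // m_dram_mee_queue[dtype]

  bool can_send() const {
    const unsigned stop_threshold = 16;  // receive_stop_threshold in l2cache.h
    return in_flight + return_queue.size() < stop_threshold;
  }
  void on_send() { in_flight++; }                                // pushed toward DRAM
  void on_reply(int mf) { return_queue.push(mf); in_flight--; }  // reply buffered
  void on_writeback_done() { in_flight--; }                      // write-backs never reply
};

int main() {
  type_credit ctr;
  unsigned sent = 0;
  while (ctr.can_send()) { ctr.on_send(); sent++; }  // stops at 16 outstanding
  printf("sent %u requests before stalling\n", sent);
  return 0;
}
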
m_CTRcache->m_tag_array->m_lines[i]->print_status(); - printf("\n"); + // printf("line %d:\t", i); + // for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + // // printf("%d\t", + // m_CTRcache->m_tag_array->m_lines[i]->print_status(); + // printf("\n"); } // } } @@ -314,14 +324,10 @@ void mee::MAC_CHECK_cycle() { if (!m_MAC_CHECK_queue->empty()) { // printf("AAAAAAAAAAAAA\n"); mem_fetch *mf = m_MAC_CHECK_queue->top(); - // print_addr("waiting for MAC Check:\t", mf); - new_addr_type REQ_addr = (new_addr_type) mf->get_original_mf(); //MAC Cache的值 unsigned HASH_id = mf->get_id(); //MAC Hash值 - // if (mf->get_sub_partition_id() == 0) - // printf("%x\n", OTP_addr); assert(HASH_id); if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 - if (m_unit->get_mpid() == 12) + // if (m_unit->get_mpid() == 12) // printf("MAC check: id %d sid %d\n", HASH_id, mf->get_sub_partition_id()); m_MAC_set[HASH_id]--; // m_MAC_table[REQ_addr] = 0; @@ -342,7 +348,7 @@ void mee::MAC_CHECK_cycle() { // printf("type:%d HASH :%d\n", mf->first, mf->get_id()); if (mf->first == MAC) m_MAC_set[mf->second]++; //MAC Hash计算完成 - if (mf->first >= BMT) + if (mf->first == BMT) m_BMT_set[mf->second]++; //BMT Hash计算完成 m_HASH_queue->pop(); } @@ -362,7 +368,7 @@ void mee::BMT_CHECK_cycle() { // if (mf->get_sub_partition_id() == 0) // printf("%x\n", OTP_addr); // assert(mf); - if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(5)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(2)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT m_BMT_set[HASH_id]--; m_BMT_CHECK_queue->pop(); print_addr("BMT Hash:\t", mf); @@ -370,6 +376,7 @@ void mee::BMT_CHECK_cycle() { if (mf->get_BMT_Layer() == BMT_L4) { // printf("AAAAAAAAAAAA\n"); BMT_busy = false; + m_n_reqs_in_BMT--; if (mf->get_id()) BMT_counter++; } else { @@ -391,6 +398,8 @@ void mee::BMT_CHECK_cycle() { } } } + // if (m_unit->get_mpid() == 13) + // printf("BMT_queue size = %d\n", m_BMT_queue->get_n_element()); } } @@ -407,14 +416,18 @@ void mee::BMT_CHECK_cycle() { // } // CTR to BMT - if (!m_CTR_BMT_Buffer->empty() && !m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { + if (!m_CTR_BMT_Buffer->empty() && m_n_reqs_in_BMT < 64 && !m_HASH_queue->full()) { // assert(cnt); mem_fetch *mf = m_CTR_BMT_Buffer->top(); // gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); - m_BMT_CHECK_queue->push(mf); - m_HASH_queue->push(new hash(BMT, mf->get_id())); - m_CTR_BMT_Buffer->pop(); - BMT_busy = true; + print_addr("CTR to BMT:\t", mf); + // if (m_unit->get_mpid() == 13) + // printf("BMT_CHECK_queue size = %d\n", m_BMT_CHECK_queue->get_n_element()); + m_n_reqs_in_BMT++; + m_BMT_CHECK_queue->push(mf); + m_HASH_queue->push(new hash(BMT, mf->get_id())); + m_CTR_BMT_Buffer->pop(); + BMT_busy = true; } } @@ -537,12 +550,15 @@ void mee::MAC_cycle() { } else { m_MAC_CHECK_queue->push(mf); //MAC读HIT,得到MAC值,发往MAC Check } + print_addr("MAC cycle access HIT:\t", mf); + print_status(m_MACcache, mf); m_MAC_queue->pop(); MAC_counter++; // } } else if (status != RESERVATION_FAIL) { // set wating for CTR fill - print_addr("MAC cycle access MISS:\t\t", mf); + print_addr("MAC cycle access MISS:\t", mf); + print_status(m_MACcache, mf); if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 // m_MAC_set[mf->get_id()]--; } @@ -550,7 +566,8 @@ 
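In the same spirit, BMT_CHECK_cycle() stops gating CTR-to-BMT dispatch on BMT_CHECK_queue fullness and instead caps the number of BMT verifications in flight with m_n_reqs_in_BMT: incremented when a CTR miss enters the check pipeline, decremented when the last-layer (BMT_L4) check retires, and limited to 64. A compressed sketch of that counter discipline (class and method names are illustrative; the hash/queue plumbing is omitted):

#include <cassert>

// Outstanding-request cap in the style of mee::m_n_reqs_in_BMT: admit a new
// BMT verification only while fewer than 64 are in flight.
class bmt_inflight_limiter {
 public:
  bool try_admit() {          // a CTR miss wants to start a BMT walk
    if (m_in_flight >= 64) return false;
    m_in_flight++;
    return true;
  }
  void retire_root_check() {  // the BMT_L4 (last layer) verification finished
    assert(m_in_flight > 0);
    m_in_flight--;
  }
 private:
  unsigned m_in_flight = 0;
};

int main() {
  bmt_inflight_limiter lim;
  for (int i = 0; i < 64; i++) assert(lim.try_admit());
  assert(!lim.try_admit());   // the 65th request must wait
  lim.retire_root_check();
  assert(lim.try_admit());
  return 0;
}
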
void mee::MAC_cycle() { MAC_counter++; } else { print_addr("MAC cycle RESERVATION_FAIL:\t", mf); - m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + print_status(m_MACcache, mf); + // m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); // if (get_sub_partition_id(mf) == 0) // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); // print_addr("MAC cycle RESERVATION_FAIL:\t", mf); @@ -642,9 +659,8 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M if (!m_unit->dram_mee_queue_empty(m_data_type)) { mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); - print_addr("fill: \t", mf_return); #ifdef BMT_Enable - if (m_data_type == CTR) + if (m_data_type == CTR && mf_return->get_access_type() == META_ACC) if (!m_META_RET_queue->full()) m_META_RET_queue->push(mf_return); else @@ -654,8 +670,11 @@ void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_M // print_addr("wating for fill:\t\t", mf); if (m_METAcache->fill_port_free()) { // assert(mf->get_access_type() != META_WR_ALLOC_R); + print_addr("fill: \t\t", mf_return); m_METAcache->fill(mf_return, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); + if (m_METAcache == m_MACcache) + print_status(m_METAcache, mf_return); // print_addr("MAC fill:\t", mf); assert(!mf_return->is_write()); // if (m_METAcache == m_BMTcache) @@ -819,7 +838,7 @@ void mee::simple_cycle(unsigned cycle) { } else { DL_CNT++; if (DL_CNT >= 10000) { - printf("DEAD LOCK! \n"); + printf("DEAD LOCK! mpid: %d\n", m_unit->get_mpid()); } // if (m_unit->get_mpid() == 0){ // if (m_CTR_RET_queue->full()) diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index d6041ca1a..1272dd830 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -16,6 +16,7 @@ class mee { void cycle(unsigned cycle); void simple_cycle(unsigned cycle); void print_addr(char s[], mem_fetch *mf); + void print_status(class meta_cache *m_METAcache, mem_fetch *mf); void print_tag(); void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, @@ -121,6 +122,7 @@ class mee { unsigned MAC_counter = 0; unsigned CTR_counter = 0; unsigned BMT_counter = 0; + unsigned m_n_reqs_in_BMT = 0; int var; unsigned DL_CNT = 0; From 338da15776e2e47ae87aea5b6b40f115a2430970 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Sat, 23 Nov 2024 10:31:49 +0800 Subject: [PATCH 132/133] mee v1.3.9.1 --- .../base_mee_normal/gpgpusim.config_base_mee_normal | 2 +- .../base_mee_sector/gpgpusim.config_base_mee_sector | 2 +- .../gpgpusim.config_base_mee_sector_L2_4MB | 2 +- .../gpgpusim.config_base_mee_sector | 2 +- .../gpgpusim.config_base_mee_sector_large_mdc_16KB | 2 +- .../gpgpusim.config_base_mee_sector | 2 +- .../gpgpusim.config_base_mee_sector_large_mdc_32KB | 4 ++-- .../gpgpusim.config_base_mee_sector_large_mdc_4KB | 2 +- .../gpgpusim.config_base_mee_sector_large_mdc_64KB | 2 +- .../gpgpusim.config_base_mee_sector_large_mdc_8KB | 2 +- .../gpgpusim.config_base_mee_sector_mdc_4x16 | 2 +- .../gpgpusim.config_base_mee_sector_mdc_4x8 | 2 +- .../gpgpusim.config_base_mee_sector_mshr_128 | 2 +- .../gpgpusim.config_base_mee_sector_mshr_32 | 2 +- .../gpgpusim.config_base_mee_sector_mshr_64 | 2 +- .../gpgpusim.config_base_mee_sector | 2 +- src/cuda-sim/ptx_loader.cc | 8 ++++---- 
src/gpgpu-sim/l2cache.h | 4 ++-- 18 files changed, 23 insertions(+), 23 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal index 82e8410e2..08fe73486 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector index 3ad0563ba..f3ecca62b 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB index f7149bf16..23ce56ad6 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector index 74f0caa86..c468dd8bc 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB index 68172310b..736c5d7f8 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector index 9f75f7d93..d09d76b43 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector +++ 
b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB index cc24459c0..775755039 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 @@ -150,7 +150,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 #-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,S:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB index b8fdd00ba..cc2a9a55d 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB index 0806c9e3d..d7d1124e7 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB index dfe15529c..62fd4494c 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 
b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 index f7ebd50cd..9786d2436 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 index 6b1bc1591..f0ce0c712 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 index 4503d3682..1c39f19da 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 index f48e9c9d7..06bb4a298 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 index 7c57ba2f2..d0316d715 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector index 7c57ba2f2..d0316d715 100644 --- a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector +++ 
b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector @@ -42,7 +42,7 @@ -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 -gpgpu_ptx_force_max_capability 70 --gpgpu_max_cycle 4000000 +-gpgpu_max_cycle 2000000 # Device Limits -gpgpu_stack_size_limit 1024 diff --git a/src/cuda-sim/ptx_loader.cc b/src/cuda-sim/ptx_loader.cc index 4e91763e8..fa304b316 100644 --- a/src/cuda-sim/ptx_loader.cc +++ b/src/cuda-sim/ptx_loader.cc @@ -354,7 +354,7 @@ void gpgpu_context::gpgpu_ptx_info_load_from_filename(const char *filename, snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", sm_version); snprintf( buff, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file /dev/null 2> %s", extra_flags, filename, ptxas_filename.c_str()); int result = system(buff); if (result != 0) { @@ -441,7 +441,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, #endif snprintf(commandline, 1024, - "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file " + "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file " "/dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); @@ -460,7 +460,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, fix_duplicate_errors(fname2); snprintf(commandline, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file " + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file " "/dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: regenerating ptxinfo using \"%s\"\n", @@ -524,7 +524,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, snprintf( commandline, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file /dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); fflush(stdout); diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 272e60e9f..ab5a592e0 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -163,8 +163,8 @@ class memory_partition_unit { fifo_pipeline *m_mee_dram_queue[5]; fifo_pipeline *m_dram_mee_queue[5]; unsigned m_n_mf[5] = {0, 0, 0, 0, 0}; - const unsigned send_trigger_threshold = 16; - const unsigned receive_stop_threshold = 16; + const unsigned send_trigger_threshold = 64; + const unsigned receive_stop_threshold = 64; unsigned last_send = 0; // fifo_pipeline *m_NORM_dram_queue; // fifo_pipeline *m_CTR_dram_queue; From 2fb389ffb39796ec25757c45751718efba2971e0 Mon Sep 17 00:00:00 2001 From: zhangqr <70464752@qq.com> Date: Thu, 26 Dec 2024 10:52:25 +0800 Subject: [PATCH 133/133] mee v1.4.0 --- src/abstract_hardware_model.h | 2 ++ src/gpgpu-sim/gpu-sim.cc | 21 +++++++++++++++++++++ src/gpgpu-sim/gpu-sim.h | 4 ++++ src/gpgpu-sim/l2cache.cc | 2 ++ src/gpgpu-sim/l2cache.h | 2 ++ src/gpgpu-sim/mee.cc | 10 +++++++--- src/gpgpu-sim/mee.h | 5 +++++ 7 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e29c7b5d3..7ffc13940 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1495,6 +1495,8 @@ class register_set { const char *m_name; }; +typedef std::map counterMap; + #endif // #ifdef __cplusplus #endif // #ifndef ABSTRACT_HARDWARE_MODEL_INCLUDED diff --git 
a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index f3554dd47..4321a55c7 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1443,6 +1443,26 @@ void gpgpu_sim::gpu_print_METACache_data_type_breakdown() { printf("m_cache_tot_meta_wb = %lld\n", m_cache_tot_meta_wb); } +void gpgpu_sim::gpu_print_ctrModCount_breakdown() { + printf("\n========= ctr modification Count breakdown =========\n"); + + int ctrModificationCountBreakdown[20]; + memset(ctrModificationCountBreakdown, 0, sizeof(ctrModificationCountBreakdown)); + counterMap *m_count; + counterMap::iterator it; + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_count = m_memory_partition_unit[i]->get_ctrModificationCount(); + + for (it = m_count->begin(); it != m_count->end(); it++) { + ctrModificationCountBreakdown[max(0, (int)floor(log2(it->second)))]++;// - 6 + } + } + + for (int i = 0; i < 10; i++) { + printf("ctrModificationCountBreakdown[%d] = %d\n", 1 << (i), ctrModificationCountBreakdown[i]); // + 7 + } +} void gpgpu_sim::gpu_print_stat() { FILE *statfout = stdout; @@ -1603,6 +1623,7 @@ void gpgpu_sim::gpu_print_stat() { // mf data type breakdown gpu_print_METACache_data_type_breakdown(); + gpu_print_ctrModCount_breakdown(); if (m_config.gpgpu_cflog_interval != 0) { spill_log_to_file(stdout, 1, gpu_sim_cycle); diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index d2f5e16eb..d2784b98f 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -574,6 +574,7 @@ class gpgpu_sim : public gpgpu_t { const gpgpu_sim_config &get_config() const { return m_config; } void gpu_print_METACache_stat(char META[]); void gpu_print_METACache_data_type_breakdown(); + void gpu_print_ctrModCount_breakdown(); void gpu_print_stat(); void dump_pipeline(int mask, int s, int m) const; @@ -728,6 +729,9 @@ class gpgpu_sim : public gpgpu_t { m_functional_sim = false; m_functional_sim_kernel = NULL; } + + typedef std::map Count; + }; class exec_gpgpu_sim : public gpgpu_sim { diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 7e4d9a605..e3a41d5eb 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -570,6 +570,8 @@ void memory_partition_unit::get_METAcache_sub_stats( } } +counterMap *memory_partition_unit::get_ctrModificationCount() { return m_mee->get_ctrModCount(); } + memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, const memory_config *config, diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index ab5a592e0..74bd45186 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -217,6 +217,8 @@ class memory_partition_unit { class gpgpu_sim *m_gpu; + public: + counterMap *get_ctrModificationCount(); friend class mee; }; diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc index d60d7e56d..f0fed8f8d 100644 --- a/src/gpgpu-sim/mee.cc +++ b/src/gpgpu-sim/mee.cc @@ -31,6 +31,8 @@ mee::mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class me m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, len); + m_ctrModCount = new counterMap; + BMT_busy = false; } int decode(int addr) { @@ -114,6 +116,8 @@ void mee::gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); CTR_addr |= CTR_base; + if (wr) + (*m_ctrModCount)[CTR_addr]++; // if (meta_acc == META_ACC && res) // size <<= 1; @@ -127,7 +131,7 @@ void mee::gen_MAC_mf(mem_fetch *mf, bool wr, 
mem_access_type meta_acc, unsigned new_addr_type partition_addr = get_partition_addr(mf); new_addr_type sub_partition_id = get_sub_partition_id(mf); if (m_config->m_META_config.m_cache_type == SECTOR) - partition_addr = partition_addr >> 5 << 1; + partition_addr = partition_addr >> 6 << 2; else partition_addr = partition_addr >> 7 << 3; new_addr_type MAC_addr = get_addr(sub_partition_id, partition_addr); @@ -803,7 +807,7 @@ void mee::simple_cycle(unsigned cycle) { #ifdef MAC_Enable if (m_config->m_META_config.m_cache_type == SECTOR) - gen_MAC_mf(mf, true, META_ACC, 2, mf_counter); + gen_MAC_mf(mf, true, META_ACC, 4, mf_counter); else gen_MAC_mf(mf, true, META_ACC, 8, mf_counter); #endif @@ -829,7 +833,7 @@ void mee::simple_cycle(unsigned cycle) { // gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); #ifdef MAC_Enable if (m_config->m_META_config.m_cache_type == SECTOR) - gen_MAC_mf(mf, false, META_ACC, 2, mf_counter); + gen_MAC_mf(mf, false, META_ACC, 4, mf_counter); else gen_MAC_mf(mf, false, META_ACC, 8, mf_counter); #endif diff --git a/src/gpgpu-sim/mee.h b/src/gpgpu-sim/mee.h index 1272dd830..909e77a68 100644 --- a/src/gpgpu-sim/mee.h +++ b/src/gpgpu-sim/mee.h @@ -126,4 +126,9 @@ class mee { int var; unsigned DL_CNT = 0; + + + public: + counterMap *m_ctrModCount; + counterMap* get_ctrModCount() { return m_ctrModCount; } }; \ No newline at end of file
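
The CTR-to-BMT handoff above no longer waits for the BMT_CHECK queue to report full; instead a new counter, m_n_reqs_in_BMT, is incremented when a request is pushed toward the BMT pipeline and decremented when its top-level (BMT_L4) check retires, and admission stops once 64 requests are in flight. A minimal credit-counter sketch of that pattern, assuming 64 is the intended cap (BmtThrottle and max_in_flight below are invented names, not simulator symbols):

    #include <cassert>

    // In-flight credit counter: admit a new request only while fewer than
    // max_in_flight requests sit between issue and retirement.
    class BmtThrottle {
     public:
      explicit BmtThrottle(unsigned max_in_flight) : m_max(max_in_flight) {}
      bool can_issue() const { return m_in_flight < m_max; }
      void issue() { assert(can_issue()); ++m_in_flight; }      // CTR -> BMT push
      void retire() { assert(m_in_flight > 0); --m_in_flight; } // BMT_L4 check done
     private:
      unsigned m_max;
      unsigned m_in_flight = 0;
    };

    int main() {
      BmtThrottle throttle(64);  // the patch hard-codes 64
      if (throttle.can_issue()) throttle.issue();
      throttle.retire();
      return 0;
    }

Counting issues against retirements bounds the total number of outstanding BMT walks rather than the occupancy of any single queue stage.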
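
The DEAD LOCK message in simple_cycle comes from a stall counter, DL_CNT, advanced on cycles where no request moves; once it crosses 10000 the partition id (mpid) is now printed alongside the message so the stuck partition can be identified. A self-contained sketch of that watchdog shape, assuming the counter is cleared again when progress resumes (the reset sits outside the hunks shown, and DeadlockWatchdog is an invented name):

    #include <cstdio>

    // Count consecutive idle cycles and report a suspected deadlock,
    // tagged with the partition id, once a threshold is crossed.
    class DeadlockWatchdog {
     public:
      explicit DeadlockWatchdog(int mpid) : m_mpid(mpid) {}
      void made_progress() { m_idle_cycles = 0; }
      void idle_cycle() {
        if (++m_idle_cycles >= 10000) printf("DEAD LOCK! mpid: %d\n", m_mpid);
      }
     private:
      int m_mpid;
      unsigned m_idle_cycles = 0;
    };

    int main() {
      DeadlockWatchdog wd(13);                          // illustrative mpid
      for (int i = 0; i < 10000; ++i) wd.idle_cycle();  // fires on the final iteration
      return 0;
    }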
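
Patch 133 routes every counter-block write through the new m_ctrModCount map (gen_CTR_mf increments it on writes) and, at the end of simulation, gpu_print_ctrModCount_breakdown buckets each block's write count by floor(log2(count)), so bucket k collects CTR blocks written between 2^k and 2^(k+1)-1 times. The sketch below reproduces only that binning step; counterMap's template arguments are elided in the diff, so an address-to-count map is assumed, and the sample values are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <map>

    // Assumed shape of the counterMap typedef: CTR block address -> write count.
    typedef std::map<unsigned long long, unsigned> counterMap;

    // Bucket each write count by floor(log2(count)); a count of 1 lands in
    // bucket 0 via the max(0, ...) guard, as in the patch.
    static void print_ctr_mod_breakdown(const counterMap &ctr_mod_count) {
      int breakdown[20] = {0};
      for (counterMap::const_iterator it = ctr_mod_count.begin();
           it != ctr_mod_count.end(); ++it)
        breakdown[std::max(0, (int)std::floor(std::log2((double)it->second)))]++;
      for (int i = 0; i < 10; i++)
        printf("ctrModificationCountBreakdown[%d] = %d\n", 1 << i, breakdown[i]);
    }

    int main() {
      counterMap sample;
      sample[0x1000] = 1;   // -> bucket 0 (label 1)
      sample[0x1040] = 3;   // -> bucket 1 (label 2)
      sample[0x1080] = 70;  // -> bucket 6 (label 64)
      print_ctr_mod_breakdown(sample);
      return 0;
    }

The printed label 1 << i is therefore the lower bound of each bucket, not an exact write count.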
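
For sector caches, gen_MAC_mf now derives the MAC address with partition_addr >> 6 << 2 instead of >> 5 << 1, and the matching MAC access size grows from 2 to 4 bytes. Both mappings reserve one byte of MAC per 16 bytes of data, but the new form appears to assign a single 4-byte MAC slot to each 64-byte sector rather than a 2-byte slot to each 32-byte half, so all addresses within one sector now resolve to the same MAC location. A small side-by-side comparison of the two shift patterns (function names are illustrative):

    #include <cstdio>

    typedef unsigned long long new_addr_type;

    // Old mapping: one 2-byte MAC slot per 32-byte chunk of data.
    static new_addr_type mac_offset_old(new_addr_type partition_addr) {
      return partition_addr >> 5 << 1;
    }

    // New mapping: one 4-byte MAC slot per 64-byte sector of data.
    static new_addr_type mac_offset_new(new_addr_type partition_addr) {
      return partition_addr >> 6 << 2;
    }

    int main() {
      // Two data addresses inside the same 64-byte sector (illustrative).
      new_addr_type a = 0x12340, b = 0x12360;
      printf("old: %llx vs %llx\n", mac_offset_old(a), mac_offset_old(b));  // 1234 vs 1236
      printf("new: %llx vs %llx\n", mac_offset_new(a), mac_offset_new(b));  // 1234 vs 1234
      return 0;
    }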